#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
GraphML exporter for TCF graphs.
"""

import os.path

from lxml import etree

from tcflib import tcf
from tcflib.service import ExportingWorker, run_as_cli


class GraphMLWorker(ExportingWorker):
    """Serializes a TCF graph layer as GraphML via an XSLT stylesheet."""

    def export(self):
        """Apply ``data/tcf2graphml.xsl`` to the corpus tree.

        :returns: the transformed document as pretty-printed UTF-8 bytes.
        """
        # The stylesheet ships alongside this module in the `data` folder.
        stylesheet_path = os.path.join(os.path.dirname(__file__),
                                       'data', 'tcf2graphml.xsl')
        transformer = etree.XSLT(etree.parse(stylesheet_path))
        result_tree = transformer(self.corpus.tree)
        return etree.tostring(result_tree, encoding='utf8', pretty_print=True)


if __name__ == '__main__':
    run_as_cli(GraphMLWorker)
# NOTE(review): `window=True)` below is the tail of a statement whose head
# lies before this chunk (apparently a delegating call from a sibling
# method); the enclosing definition is not visible here.
                                               window=True)

    def build_graph_textspan_real(self, textspans, window=False):
        """Build a co-occurrence graph by linking tokens that share a span.

        :parameters:
          - `textspans`: iterable of TextSpan objects whose tokens are
            linked pairwise.
          - `window`: if True, regroup spans into n-gram windows (sizes
            taken from `self.options.window`) before linking.
        :returns:
          - a `tcf.Graph` with one node per accepted token and one edge
            per co-occurring token pair.
        """
        graph = tcf.Graph(label=self.options.label,
                          weight=self.options.weight)
        if window:
            # Do not use textspans directly, but use windows of x textspans.
            textspans_old, textspans = list(textspans), []
            # NOTE(review): the loop variable shadows the `window` parameter
            # below — harmless after this point, but confusing; confirm.
            for window in self.options.window:
                for n_gram in n_grams(textspans_old, window):
                    span = tcf.TextSpan()
                    for span_old in n_gram:
                        span.tokens.extend(span_old.tokens)
                    textspans.append(span)
        n = len(textspans)
        for i, span in enumerate(textspans, start=1):
            logging.debug('Creating network for textspan {}/{}.'.format(i, n))
            # Deduplicate and filter the span's tokens before linking.
            tokens = set([token for token in span.tokens
                          if self.test_token(token)])
            logging.debug('Using {} tokens.'.format(len(tokens)))
            for token in tokens:
                graph.node_for_token(token)
            # Link every unordered pair of distinct tokens in the span.
            for combo in combinations(tokens, 2):
                try:
                    graph.edge_for_tokens(*combo, unique=self.options.unique)
                except tcf.LoopError:
                    # Both tokens map onto the same node: skip self-loops.
                    continue
        return graph


if __name__ == '__main__':
    run_as_cli(CooccurrenceWorker)
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
JSON exporter for TCF graphs.
"""

import os.path

from lxml import etree

from tcflib import tcf
from tcflib.service import ExportingWorker, run_as_cli


class JSONWorker(ExportingWorker):
    """Serializes a TCF graph layer as JSON via an XSLT stylesheet."""

    def export(self):
        """Apply ``data/tcf2json.xsl`` to the corpus tree and return the
        resulting JSON as a string.
        """
        stylesheet_path = os.path.join(os.path.dirname(__file__),
                                       'data', 'tcf2json.xsl')
        transformer = etree.XSLT(etree.parse(stylesheet_path))
        return str(transformer(self.corpus.tree))


if __name__ == '__main__':
    run_as_cli(JSONWorker)
            # NOTE(review): this loop is the tail of a method (apparently
            # `find_edges`) whose head lies before this chunk: it recurses
            # into each filtered dependent and re-yields its edges.
            for dependent in self.find_dependents(parse, head):
                for dependent_edge in self.find_edges(
                        parse, dependent):
                    yield dependent_edge

    def find_dependents(self, parse, head, descend=True):
        """
        Generator method that returns all filtered dependents of a given
        head.

        If the direct dependents of a head are filtered out and `descend`
        is True, it looks for their dependents until it finds valid ones.

        :parameters:
          - `parse`: A parse element.
          - `head`: The ID of the head element.
          - `descend`: Descend the parse tree to find valid tokens.
        :returns:
          - yields dependent's IDs.
        """
        for dependent in parse.find_dependents(head):
            if self.test_token(dependent):
                yield dependent
            elif descend:
                # The dependent itself was filtered out: surface its own
                # (transitive) dependents instead.
                for dependent2 in self.find_dependents(parse, dependent):
                    yield dependent2


if __name__ == '__main__':
    run_as_cli(DependencyWorker)
# NOTE(review): this chunk begins inside a WordSampler method, apparently in
# a loop over annotation layers and their elements; `keep`, `removable`,
# `layer`, `elem` and `tokens_to_keep` are bound before this point. The
# nesting below is reconstructed — confirm against the original file.
                for token in elem.tokens:
                    if token in tokens_to_keep:
                        keep = True
                        # NOTE(review): `continue` after setting the flag —
                        # `break` would suffice; confirm intent.
                        continue
                if not keep:
                    # No token of this element survives sampling: drop it.
                    removable.append(elem)
            if isinstance(layer, AnnotationLayer):
                # List-like interface
                for elem in removable:
                    layer.remove(elem)
            elif isinstance(layer, AnnotationLayerWithIDs):
                # Dict-like interface
                for elem in removable:
                    del layer[elem.id]
        # Step 3: Remove obsolete tokens
        removable = []
        for token in self.corpus.tokens:
            if not token in tokens_to_keep:
                removable.append(token)
        # Delete in a second pass so we never mutate while iterating.
        for token in removable:
            del self.corpus.tokens[token.id]
        # Remove old layer
        old_layer = self.corpus._tree.find(f'//{P_TEXT}tokens')
        old_layer.getparent().remove(old_layer)
        # Add to `new_layers` to force re-serialization
        self.corpus.new_layers.insert(0, 'tokens')  # Make sure it’s the first layer


if __name__ == '__main__':
    run_as_cli(WordSampler)
# NOTE(review): the three statements below are the tail of a helper
# (apparently `listsplit`, used further down) whose head lies before this
# chunk.
            result.append(tail)
            break
    return result


class NltkTokenizer(AddingWorker):
    """Tokenizer worker based on NLTK's sentence and word tokenizers."""

    def add_annotations(self):
        """Tokenize the corpus text and populate the tokens, sentences and
        text-structure layers.
        """
        # Add base layers
        self.corpus.add_layer(Tokens())
        self.corpus.add_layer(Sentences())
        self.corpus.add_layer(TextStructure())
        # Parse text
        text = self.corpus.text.text
        # Blank lines delimit paragraphs.
        paragraphs = listsplit(text.splitlines(), '')
        paragraphs = ['\n'.join(lines) for lines in paragraphs]
        for paragraph in paragraphs:
            textspan = TextSpan(type='paragraph')
            for sent in sent_tokenize(paragraph):
                sentence = Sentence()
                for word in word_tokenize(sent):
                    token = Token(word)
                    # Register each token in all three layers.
                    self.corpus.tokens.append(token)
                    sentence.tokens.append(token)
                    textspan.tokens.append(token)
                self.corpus.sentences.append(sentence)
            self.corpus.textstructure.append(textspan)


if __name__ == '__main__':
    run_as_cli(NltkTokenizer)
        # NOTE(review): this chunk begins inside `export`; `columns` is an
        # ordered mapping of column name -> per-token values, initialized
        # before this point.
        if hasattr(self.corpus, 'postags'):
            columns['POStag'] = [token.tag for token in self.corpus.tokens]
        if hasattr(self.corpus, 'lemmas'):
            columns['lemma'] = [token.lemma for token in self.corpus.tokens]
        if hasattr(self.corpus, 'wsd'):
            columns['wordsenses'] = [
                ', '.join(token.wordsenses)
                for token in self.corpus.tokens
            ]
        if hasattr(self.corpus, 'namedentities'):
            # IOB2-style tags: 'B-<class>' on an entity's first token,
            # 'I-<class>' on the rest, '' outside any entity.
            entities = []
            for token in self.corpus.tokens:
                if not token.entity:
                    entities.append('')
                elif token == token.entity.tokens[0]:
                    entities.append('B-{}'.format(token.entity.class_))
                else:
                    entities.append('I-{}'.format(token.entity.class_))
            columns['NamedEntity'] = entities
        # Write to CSV
        with StringIO(newline='') as csvfile:
            writer = csv.writer(csvfile)
            # Header row first, then one row per token (columns zipped in
            # insertion order).
            writer.writerow(list(columns.keys()))
            for row in zip(*columns.values()):
                writer.writerow(row)
            outstring = csvfile.getvalue()
        return outstring.encode('utf-8')


if __name__ == '__main__':
    run_as_cli(CSVExporter)
# token ID as node label for now. We replace it with the token text # later. node_a = nodes.find_node(a_id) if node_a is None: node_a = nodes.add_node(a_id) node_b = nodes.find_node(b_id) if node_b is None: node_b = nodes.add_node(b_id) # add edge or increment weight edge = edges.find_edge(node_a.get('ID'), node_b.get('ID')) if edge is None: # add edge edge = edges.add_edge(node_a.get('ID'), node_b.get('ID')) # Replace token IDs with token text now. for node in nodes.findall(tcf.P_TEXT + 'node'): token_id = node.text token = self.corpus.find_token(token_id) node.text = token.text return graph def find_dependency_edges(self, parse, head): for dependent in parse.find_dependents(head): yield (head, dependent) for dependency_edge in self.find_dependency_edges( parse, dependent): yield dependency_edge if __name__ == '__main__': run_as_cli(ComparingWorker)
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
JSON exporter for TCF graphs.
"""

import os.path

from lxml import etree

from tcflib import tcf
from tcflib.service import ExportingWorker, run_as_cli


class JSONWorker(ExportingWorker):
    """Turns a TCF graph layer into JSON using an XSLT stylesheet."""

    def export(self):
        """Run ``data/tcf2json.xsl`` over the corpus tree.

        :returns: the transformation result as a string.
        """
        xsl_path = os.path.join(os.path.dirname(__file__),
                                'data', 'tcf2json.xsl')
        stylesheet = etree.parse(xsl_path)
        transform = etree.XSLT(stylesheet)
        result = transform(self.corpus.tree)
        return str(result)


if __name__ == '__main__':
    run_as_cli(JSONWorker)
.. _TreeTagger: http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/ """ executable = None models = {} params = ['-token', '-lemma', '-sgml', '-pt-with-lemma'] def add_annotations(self): # Add base layers model = self.models[self.corpus.lang] self.corpus.add_layer(POStags(model.tagset)) self.corpus.add_layer(Lemmas()) # tag tokens = [token.text for token in self.corpus.tokens] cmd = [self.executable] + self.params + [model.file] tagger = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE) outs, errs = tagger.communicate('\n'.join(tokens).encode('utf-8')) # TODO: Check returncode outlines = outs.splitlines() assert len(outlines) == len(self.corpus.tokens) for token, line in zip(self.corpus.tokens, outlines): _, tag, lemma = line.decode('utf-8').split('\t') token.tag = tag token.lemma = lemma if __name__ == '__main__': run_as_cli(TreeTagger)
            # NOTE(review): this chunk begins inside `export`, in the middle
            # of an if/else selecting the token filter; the `if` head lies
            # before this chunk.
            postags = [ISOcat[postag] for postag in self.options.postags]
            tokenfilter = posfilter(postags)
        else:
            # Default: keep only open-class (content) words.
            tokenfilter = lambda token: not token.postag.is_closed
        # The textstructure layer can be used like a list:
        if self.options.spantype:
            textspans = [span for span in self.corpus.textstructure
                         if span.type == self.options.spantype]
        else:
            textspans = self.corpus.textstructure
        # Ensure prefix does not contain whitespace
        prefix = re.sub(r'\s+', '_', self.options.prefix)
        # Do the actual work. This mallet output uses lemma as token value.
        output = []
        for i, span in enumerate(textspans, start=1):
            # Filter tokens by POS and use lemmata
            words = [token.lemma for token in span.tokens
                     if tokenfilter(token)]
            # Deal with TreeTagger’s `<unknown>` pseudo-lemma
            words = [word for word in words if not word == '<unknown>']
            # Append a line in mallet’s `<document> <label> <words...>` format
            output.append('{}{} {} {}\n'.format(prefix, i, self.corpus.lang,
                                                ' '.join(words)))
        # ExportingWorker returns output as bytes.
        return ''.join(output).encode('utf8')


if __name__ == '__main__':
    run_as_cli(MalletWorker)
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""
A to-TCF converter that wraps the SfS web service.
"""

from tcflib.service import RemoteWorker, run_as_cli


class ToTCFConverter(RemoteWorker):
    """Converts input to TCF by delegating to the SfS WebLicht converter
    service."""

    # Default request parameters sent to the remote service.
    __options__ = {
        'informat': 'plaintext',
        'outformat': 'tcf04',
        'language': 'de'
    }
    url = 'http://weblicht.sfs.uni-tuebingen.de/rws/service-converter/convert/qp'


if __name__ == '__main__':
    # BUG FIX: was `run_as_cli(NltkTokenizer)` — a copy-paste leftover
    # referencing a class neither defined nor imported in this module
    # (NameError at runtime). Run the converter defined here instead.
    run_as_cli(ToTCFConverter)
        # NOTE(review): this chunk begins inside `export`; the `columns`
        # mapping (column name -> per-token values) is created just before
        # this point.
        columns['tokenID'] = [token.id for token in self.corpus.tokens]
        columns['token'] = [token.text for token in self.corpus.tokens]
        # Optional layers: only emit a column when the layer exists.
        if hasattr(self.corpus, 'postags'):
            columns['POStag'] = [token.tag for token in self.corpus.tokens]
        if hasattr(self.corpus, 'lemmas'):
            columns['lemma'] = [token.lemma for token in self.corpus.tokens]
        if hasattr(self.corpus, 'wsd'):
            columns['wordsenses'] = [', '.join(token.wordsenses)
                                     for token in self.corpus.tokens]
        if hasattr(self.corpus, 'namedentities'):
            # IOB2-style tags: 'B-<class>' on an entity's first token,
            # 'I-<class>' on the rest, '' outside any entity.
            entities = []
            for token in self.corpus.tokens:
                if not token.entity:
                    entities.append('')
                elif token == token.entity.tokens[0]:
                    entities.append('B-{}'.format(token.entity.class_))
                else:
                    entities.append('I-{}'.format(token.entity.class_))
            columns['NamedEntity'] = entities
        # Write to CSV
        with StringIO(newline='') as csvfile:
            writer = csv.writer(csvfile)
            # Header row first, then one row per token (columns zipped in
            # insertion order).
            writer.writerow(list(columns.keys()))
            for row in zip(*columns.values()):
                writer.writerow(row)
            outstring = csvfile.getvalue()
        return outstring.encode('utf-8')


if __name__ == '__main__':
    run_as_cli(CSVExporter)
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""
GraphML exporter for TCF graphs.
"""

import os.path

from lxml import etree

from tcflib import tcf
from tcflib.service import ExportingWorker, run_as_cli


class GraphMLWorker(ExportingWorker):
    """Turns a TCF graph layer into GraphML using an XSLT stylesheet."""

    def export(self):
        """Run ``data/tcf2graphml.xsl`` over the corpus tree.

        :returns: the GraphML document as pretty-printed UTF-8 bytes.
        """
        xsl_path = os.path.join(os.path.dirname(__file__),
                                'data', 'tcf2graphml.xsl')
        stylesheet = etree.parse(xsl_path)
        transform = etree.XSLT(stylesheet)
        result = transform(self.corpus.tree)
        return etree.tostring(result, encoding='utf8', pretty_print=True)


if __name__ == '__main__':
    run_as_cli(GraphMLWorker)