Example #1
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
GraphML exporter for TCF graphs.

"""

import os.path

from lxml import etree
from tcflib import tcf
from tcflib.service import ExportingWorker, run_as_cli


class GraphMLWorker(ExportingWorker):

    def export(self):
        input_tree = self.corpus.tree
        xslt_file = os.path.join(os.path.dirname(__file__),
                                 'data', 'tcf2graphml.xsl')
        xslt_tree = etree.parse(xslt_file)
        transform = etree.XSLT(xslt_tree)
        output_tree = transform(input_tree)
        return etree.tostring(output_tree, encoding='utf8', pretty_print=True)


if __name__ == '__main__':
    run_as_cli(GraphMLWorker)
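For reference, the export boils down to lxml's XSLT support: parse the stylesheet once, compile it with etree.XSLT, and call the resulting transformer on the input tree. A minimal standalone sketch of the same pattern, with illustrative file names:

from lxml import etree

# Parse the stylesheet and compile it into a callable transformer.
xslt_tree = etree.parse('tcf2graphml.xsl')   # illustrative path
transform = etree.XSLT(xslt_tree)
# Apply the transformation to any parsed document.
input_tree = etree.parse('input.xml')        # illustrative path
output_tree = transform(input_tree)
print(etree.tostring(output_tree, encoding='utf8', pretty_print=True))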
Example #2
                                              window=True)

    def build_graph_textspan_real(self, textspans, window=False):
        graph = tcf.Graph(label=self.options.label, weight=self.options.weight)
        if window:
            # Do not use textspans directly, but use windows of x textspans.
            textspans_old, textspans = list(textspans), []
            for window in self.options.window:
                for n_gram in n_grams(textspans_old, window):
                    span = tcf.TextSpan()
                    for span_old in n_gram:
                        span.tokens.extend(span_old.tokens)
                    textspans.append(span)
        n = len(textspans)
        for i, span in enumerate(textspans, start=1):
            logging.debug('Creating network for textspan {}/{}.'.format(i, n))
            tokens = set([token for token in span.tokens
                          if self.test_token(token)])
            logging.debug('Using {} tokens.'.format(len(tokens)))
            for token in tokens:
                graph.node_for_token(token)
            for combo in combinations(tokens, 2):
                try:
                    graph.edge_for_tokens(*combo, unique=self.options.unique)
                except tcf.LoopError:
                    continue
        return graph

if __name__ == '__main__':
    run_as_cli(CooccurrenceWorker)
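The core idea, stripped of the tcf types: slide windows over the textspans, pool their tokens, and connect every pair. A minimal sketch over plain strings, with a simple stand-in for the n_grams helper used above:

from collections import Counter
from itertools import combinations

def n_grams(items, n):
    # Stand-in for the n_grams helper the worker imports.
    return [items[i:i + n] for i in range(len(items) - n + 1)]

spans = [['graph', 'node'], ['node', 'edge'], ['edge', 'weight']]
weights = Counter()
for window in n_grams(spans, 2):
    tokens = set(token for span in window for token in span)
    for a, b in combinations(sorted(tokens), 2):
        weights[a, b] += 1    # add edge or increment weight
print(weights.most_common())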
Example #3
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
JSON exporter for TCF graphs.

"""

import os.path

from lxml import etree
from tcflib import tcf
from tcflib.service import ExportingWorker, run_as_cli


class JSONWorker(ExportingWorker):

    def export(self):
        input_tree = self.corpus.tree
        xslt_file = os.path.join(os.path.dirname(__file__),
                                 'data', 'tcf2json.xsl')
        xslt_tree = etree.parse(xslt_file)
        transform = etree.XSLT(xslt_tree)
        output = str(transform(input_tree))
        return output


if __name__ == '__main__':
    run_as_cli(JSONWorker)
Example #4
        for dependent in self.find_dependents(parse, head):
            for dependent_edge in self.find_edges(
                    parse, dependent):
                yield dependent_edge

    def find_dependents(self, parse, head, descend=True):
        """
        Generator method that returns all filtered dependents of a given head.

        If the direct dependents of a head are filtered out and `descend` is
        True, it looks for their dependents until it finds valid ones.

        :parameters:
            - `parse`: A parse element.
            - `head`: The ID of the head element.
            - `descend`: Descend the parse tree to find valid tokens.
        :returns:
            - yields dependent's IDs.

        """
        for dependent in parse.find_dependents(head):
            if self.test_token(dependent):
                yield dependent
            elif descend:
                for dependent2 in self.find_dependents(parse, dependent):
                    yield dependent2


if __name__ == '__main__':
    run_as_cli(DependencyWorker)
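The descend logic is easiest to see on a toy parse: when a dependent is filtered out, its own dependents are tried instead, so filtered heads are skipped transparently. A self-contained sketch with a plain dict and a stopword test standing in for the tcflib API:

PARSE = {'likes': ['the', 'fish'], 'the': ['cat'], 'cat': [], 'fish': []}

def test_token(token):
    return token not in {'the'}    # filter out function words

def find_dependents(parse, head, descend=True):
    for dependent in parse[head]:
        if test_token(dependent):
            yield dependent
        elif descend:
            yield from find_dependents(parse, dependent)

print(list(find_dependents(PARSE, 'likes')))    # ['cat', 'fish']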
Example #5
                for token in elem.tokens:
                    if token in tokens_to_keep:
                        keep = True
                        continue
                if not keep:
                    removable.append(elem)
            if isinstance(layer, AnnotationLayer):
                # List-like interface
                for elem in removable:
                    layer.remove(elem)
            elif isinstance(layer, AnnotationLayerWithIDs):
                # Dict-like interface
                for elem in removable:
                    del layer[elem.id]
        # Step 3: Remove obsolete tokens
        removable = []
        for token in self.corpus.tokens:
            if token not in tokens_to_keep:
                removable.append(token)
        for token in removable:
            del self.corpus.tokens[token.id]
        # Remove old layer
        old_layer = self.corpus._tree.find(f'//{P_TEXT}tokens')
        old_layer.getparent().remove(old_layer)
        # Add to `new_layers` to force re-serialization
        self.corpus.new_layers.insert(0, 'tokens')  # Make sure it’s the first layer


if __name__ == '__main__':
    run_as_cli(WordSampler)    
Example #6
                result.append(tail)
            break
    return result


class NltkTokenizer(AddingWorker):
    def add_annotations(self):
        # Add base layers
        self.corpus.add_layer(Tokens())
        self.corpus.add_layer(Sentences())
        self.corpus.add_layer(TextStructure())
        # Parse text
        text = self.corpus.text.text
        paragraphs = listsplit(text.splitlines(), '')
        paragraphs = ['\n'.join(lines) for lines in paragraphs]
        for paragraph in paragraphs:
            textspan = TextSpan(type='paragraph')
            for sent in sent_tokenize(paragraph):
                sentence = Sentence()
                for word in word_tokenize(sent):
                    token = Token(word)
                    self.corpus.tokens.append(token)
                    sentence.tokens.append(token)
                    textspan.tokens.append(token)
                self.corpus.sentences.append(sentence)
            self.corpus.textstructure.append(textspan)


if __name__ == '__main__':
    run_as_cli(NltkTokenizer)
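The tokenizer delegates sentence and word splitting to NLTK. A minimal sketch of those two calls; they need the punkt tokenizer models to be downloaded once (newer NLTK releases may ask for 'punkt_tab' instead):

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt', quiet=True)    # one-time model download
text = 'TCF stores tokens. Sentences group them.'
for sent in sent_tokenize(text):
    print(word_tokenize(sent))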
Example #7
        if hasattr(self.corpus, 'postags'):
            columns['POStag'] = [token.tag for token in self.corpus.tokens]
        if hasattr(self.corpus, 'lemmas'):
            columns['lemma'] = [token.lemma for token in self.corpus.tokens]
        if hasattr(self.corpus, 'wsd'):
            columns['wordsenses'] = [
                ', '.join(token.wordsenses) for token in self.corpus.tokens
            ]
        if hasattr(self.corpus, 'namedentities'):
            entities = []
            for token in self.corpus.tokens:
                if not token.entity:
                    entities.append('')
                elif token == token.entity.tokens[0]:
                    entities.append('B-{}'.format(token.entity.class_))
                else:
                    entities.append('I-{}'.format(token.entity.class_))
            columns['NamedEntity'] = entities
        # Write to CSV
        with StringIO(newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(list(columns.keys()))
            for row in zip(*columns.values()):
                writer.writerow(row)
            outstring = csvfile.getvalue()
        return outstring.encode('utf-8')


if __name__ == '__main__':
    run_as_cli(CSVExporter)
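The NamedEntity column uses BIO labels: the first token of an entity gets a B- prefix, the following tokens get I-, and tokens outside any entity stay empty. A minimal sketch of the scheme over plain lists, with made-up spans:

tokens = ['Angela', 'Merkel', 'visited', 'Paris']
spans = [(0, 2, 'PER'), (3, 4, 'LOC')]    # (start, end, class)
labels = [''] * len(tokens)
for start, end, class_ in spans:
    labels[start] = 'B-{}'.format(class_)
    for i in range(start + 1, end):
        labels[i] = 'I-{}'.format(class_)
print(list(zip(tokens, labels)))
# [('Angela', 'B-PER'), ('Merkel', 'I-PER'), ('visited', ''), ('Paris', 'B-LOC')]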
Example #8
            # token ID as node label for now. We replace it with the token text
            # later.
            node_a = nodes.find_node(a_id)
            if node_a is None:
                node_a = nodes.add_node(a_id)
            node_b = nodes.find_node(b_id)
            if node_b is None:
                node_b = nodes.add_node(b_id)
            # add edge or increment weight
            edge = edges.find_edge(node_a.get('ID'), node_b.get('ID'))
            if edge is None:
                # add edge
                edge = edges.add_edge(node_a.get('ID'), node_b.get('ID'))
        # Replace token IDs with token text now.
        for node in nodes.findall(tcf.P_TEXT + 'node'):
            token_id = node.text
            token = self.corpus.find_token(token_id)
            node.text = token.text
        return graph

    def find_dependency_edges(self, parse, head):
        for dependent in parse.find_dependents(head):
            yield (head, dependent)
            for dependency_edge in self.find_dependency_edges(
                    parse, dependent):
                yield dependency_edge


if __name__ == '__main__':
    run_as_cli(ComparingWorker)
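find_dependency_edges walks the whole subtree below a head and yields one (head, dependent) pair per dependency. The recursion is easiest to follow on a plain dict standing in for the parse:

PARSE = {'root': ['a', 'b'], 'a': ['c'], 'b': [], 'c': []}

def find_dependency_edges(parse, head):
    for dependent in parse[head]:
        yield (head, dependent)
        yield from find_dependency_edges(parse, dependent)

print(list(find_dependency_edges(PARSE, 'root')))
# [('root', 'a'), ('a', 'c'), ('root', 'b')]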
Example #11
    .. _TreeTagger: http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/

    """

    executable = None
    models = {}
    params = ['-token', '-lemma', '-sgml', '-pt-with-lemma']

    def add_annotations(self):
        # Add base layers
        model = self.models[self.corpus.lang]
        self.corpus.add_layer(POStags(model.tagset))
        self.corpus.add_layer(Lemmas())
        # tag
        tokens = [token.text for token in self.corpus.tokens]
        cmd = [self.executable] + self.params + [model.file]
        tagger = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
        outs, errs = tagger.communicate('\n'.join(tokens).encode('utf-8'))
        # TODO: Check returncode
        outlines = outs.splitlines()
        assert len(outlines) == len(self.corpus.tokens)
        for token, line in zip(self.corpus.tokens, outlines):
            _, tag, lemma = line.decode('utf-8').split('\t')
            token.tag = tag
            token.lemma = lemma


if __name__ == '__main__':
    run_as_cli(TreeTagger)
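The tagger is driven through a pipe: one token per line on stdin, one tab-separated line per token on stdout. The same Popen/communicate pattern with cat as a stand-in command (POSIX only), since no TreeTagger binary is assumed here:

import subprocess as sp

proc = sp.Popen(['cat'], stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
outs, errs = proc.communicate('Hello\nworld'.encode('utf-8'))
print(outs.splitlines())    # [b'Hello', b'world']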
Example #13
            postags = [ISOcat[postag] for postag in self.options.postags]
            tokenfilter = posfilter(postags)
        else:
            tokenfilter = lambda token: not token.postag.is_closed
        # The textstructure layer can be used like a list:
        if self.options.spantype:
            textspans = [span for span in self.corpus.textstructure
                         if span.type == self.options.spantype]
        else:
            textspans = self.corpus.textstructure
        # Ensure prefix does not contain whitespace
        prefix = re.sub(r'\s+', '_', self.options.prefix)
        # Do the actual work. This mallet output uses lemma as token value.
        output = []
        for i, span in enumerate(textspans, start=1):
            # Filter tokens by POS and use lemmata
            words = [token.lemma for token in span.tokens
                     if tokenfilter(token)]
            # Deal with TreeTagger’s `<unknown>` pseudo-lemma
            words = [word for word in words if not word == '<unknown>']
            # Append a line in mallet’s `<document> <label> <words...>` format
            output.append('{}{} {} {}\n'.format(prefix, i,
                                                self.corpus.lang,
                                                ' '.join(words)))
        # ExportingWorker returns output as bytes.
        return ''.join(output).encode('utf8')


if __name__ == '__main__':
    run_as_cli(MalletWorker)
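Each output line follows mallet's one-document-per-line input format, <document> <label> <words...>, with the language code as label. A tiny sketch with made-up lemmata:

prefix, lang = 'doc_', 'de'
spans = [['katze', 'hund'], ['baum', 'haus']]
for i, words in enumerate(spans, start=1):
    print('{}{} {} {}'.format(prefix, i, lang, ' '.join(words)))
# doc_1 de katze hund
# doc_2 de baum haus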
Example #14
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""
A to-TCF converter that wraps the SfS web service.

"""

from tcflib.service import RemoteWorker, run_as_cli


class ToTCFConverter(RemoteWorker):

    __options__ = {
        'informat': 'plaintext',
        'outformat': 'tcf04',
        'language': 'de'
    }
    url = 'http://weblicht.sfs.uni-tuebingen.de/rws/service-converter/convert/qp'


if __name__ == '__main__':
    run_as_cli(ToTCFConverter)
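RemoteWorker hands the options and the input text over to the web service. Roughly what such a call looks like, sketched here with requests; treating the options as query parameters and posting the text as the request body are assumptions, not part of the source:

import requests

url = 'http://weblicht.sfs.uni-tuebingen.de/rws/service-converter/convert/qp'
params = {'informat': 'plaintext', 'outformat': 'tcf04', 'language': 'de'}
# Hypothetical call shape; the real wiring is done by RemoteWorker.
response = requests.post(url, params=params, data='Hallo Welt'.encode('utf-8'))
print(response.status_code)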
Example #15
        columns['tokenID'] = [token.id for token in self.corpus.tokens]
        columns['token'] = [token.text for token in self.corpus.tokens]
        if hasattr(self.corpus, 'postags'):
            columns['POStag'] = [token.tag for token in self.corpus.tokens]
        if hasattr(self.corpus, 'lemmas'):
            columns['lemma'] = [token.lemma for token in self.corpus.tokens]
        if hasattr(self.corpus, 'wsd'):
            columns['wordsenses'] = [', '.join(token.wordsenses)
                                     for token in self.corpus.tokens]
        if hasattr(self.corpus, 'namedentities'):
            entities = []
            for token in self.corpus.tokens:
                if not token.entity:
                    entities.append('')
                elif token == token.entity.tokens[0]:
                    entities.append('B-{}'.format(token.entity.class_))
                else:
                    entities.append('I-{}'.format(token.entity.class_))
            columns['NamedEntity'] = entities
        # Write to CSV
        with StringIO(newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(list(columns.keys()))
            for row in zip(*columns.values()):
                writer.writerow(row)
            outstring = csvfile.getvalue()
        return outstring.encode('utf-8')

if __name__ == '__main__':
    run_as_cli(CSVExporter)
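The CSV is assembled entirely in memory: rows go into a StringIO, and the finished text is read back with getvalue() before encoding. The pattern in isolation:

import csv
from io import StringIO

columns = {'token': ['a', 'b'], 'lemma': ['x', 'y']}
with StringIO(newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(list(columns.keys()))
    writer.writerows(zip(*columns.values()))
    outstring = csvfile.getvalue()
print(outstring.encode('utf-8'))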