Code example #1
import argparse
import codecs
import json
from collections import Counter, defaultdict

from nltk.tag.util import str2tuple, tuple2str
from tabulate import tabulate


def main():
    parser = argparse.ArgumentParser(description="""\
    Creates tag statistics.
    """)
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    parser.add_argument("-L", "--lexicon", required=True, help="lexicon file")
    parser.add_argument("-M", "--max", help="maximum output")
    args = parser.parse_args()

    if args.input and args.output and args.lexicon:
        lexicon = json.load(open(args.lexicon))
        with codecs.open(args.output, "w", "utf-8") as out:
            wt = defaultdict(set)
            wc = Counter()
            wtc = Counter()

            for sentence in codecs.open(args.input, "r", "utf-8"):
                tokens = [str2tuple(token) for token in sentence.split()]

                for word, tag in tokens:
                    wt[word].add(tag)
                    wc[word] += 1
                    wtc[tuple2str((word, tag))] += 1

            r = {"Count": [],
                 "Words": [],
                 "Found": [],
                 "Lexicon": []}

            if args.max:
                max_num = int(args.max)
            else:
                max_num = None

            for word, count in wc.most_common(max_num):
                r["Words"].append(word)
                r["Count"].append(count)
                tg = set()

                for tag in wt[word]:
                    t = tuple2str((word, tag))
                    in_lex = ""
                    if lexicon.get(word.lower()):
                        if tag not in lexicon.get(word.lower()):
                            in_lex = "*"
                    tg.add((tag + in_lex, wtc[t]))

                tg = sorted(tg, key=lambda k: k[1], reverse=True)
                r["Found"].append(", ".join([u"{0} ({1})".format(x, y)
                                                       for x, y in tg]))
                if lexicon.get(word.lower()):
                    r["Lexicon"].append(", ".join(lexicon.get(word.lower())))
                else:
                    r["Lexicon"].append("")
            out.write(u"{0}".format(tabulate(r,
                                             headers="keys",
                                             tablefmt="pipe")))
    else:
        parser.print_help()
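The examples on this page all revolve around nltk.tag.util.str2tuple and its inverse tuple2str. As a quick reference, a minimal sketch of their behavior (expected outputs shown as comments):

from nltk.tag.util import str2tuple, tuple2str, untag

# str2tuple splits on the *last* separator and upper-cases the tag;
# a token without a separator gets a tag of None.
print(str2tuple('fly/NN'))        # ('fly', 'NN')
print(str2tuple('1/2/CD'))        # ('1/2', 'CD')
print(str2tuple('hello'))         # ('hello', None)

print(tuple2str(('fly', 'NN')))   # fly/NN
print(untag([('the', 'DT'), ('cat', 'NN')]))  # ['the', 'cat']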
Code example #2
File: featureshasing.py, Project: mishadev/stuff
 def features_to_words(self, features):
     spliter_re = re.compile(r"\s")  # raw string avoids an invalid escape sequence
     words = set(word for feature in features for word in spliter_re.split(feature))
     if self.use_pos_tag:
         # Keep only the word part of each word/tag token
         words = [str2tuple(word)[0] for word in words]
     # Drop empty strings produced by consecutive whitespace
     return [word for word in words if word]
Code example #3

from itertools import product

from nltk.data import load
from nltk.tag.util import str2tuple
from tqdm import tqdm

# `pairwise` and `rawcount` are project helpers not shown in the snippet:
# pairwise yields consecutive pairs (a sketch follows this example) and
# rawcount returns the number of lines in a file (used for the tqdm total).


def read_training_data(training_file):
    """
    Extracts part-of-speech (POS) tag, transition between tags, and emission counts from a tagged training corpus.

    The POS tag count keeps track of the number of times a given POS tag occurs in the training data.
    This is stored in a dictionary with POS tag keys and integer count values.

    The transition counts keep track of how often a first tag is followed by a second tag.
    This is stored in a dictionary with tuple(tag1, tag2) keys and, as values, the number of times tag1 is followed by tag2.

    The emission count keeps track of the number of times a word and its associated tag occur together in the data.
    This is stored in a dictionary with tuple(word, POS tag) keys and integer count values.

    The training file is expected to be a training set of POS-tagged sentences, separated by newline characters.
    Additional custom tags, "START" and "END", are included to indicate the start and end of each sentence.

    :param training_file: the location of the training file
    :return: a tuple of dictionaries tracking tag counts, transition counts, and emission counts
    """
    tag_types = list(load('help/tagsets/upenn_tagset.pickle').keys()) + [
        "START", "END", "-LRB-", "-RRB-", "#"
    ]
    tag_types = [x for x in tag_types if x not in ["(", ")", "--"]
                 ]  # The tagset in nltk uses different notations
    tag_type_permutations = list(product(tag_types, repeat=2))

    tag_counts = dict.fromkeys(tag_types, 0)
    transition_counts = dict.fromkeys(tag_type_permutations, 0)
    emission_counts = {}

    with open(training_file, "r") as training_data:
        for line in tqdm(training_data,
                         total=rawcount(training_file),
                         desc="Training"):

            tagged_tokens = tuple(
                str2tuple(tagged_token) for tagged_token in line.split())
            tag_sequence = ("START", ) + tuple(
                tagged_token[1] for tagged_token in tagged_tokens) + ("END", )

            for tag in tag_sequence:
                tag_counts[tag] += 1

            for tag_pair in pairwise(tag_sequence):
                transition_counts[tag_pair] += 1

            for tagged_token in tagged_tokens:
                if tagged_token in emission_counts:
                    emission_counts[tagged_token] += 1
                else:
                    emission_counts[tagged_token] = 1

    return tag_counts, transition_counts, emission_counts
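A minimal sketch of the pairwise helper used above, following the classic itertools recipe (Python 3.10+ also ships itertools.pairwise directly):

from itertools import tee

def pairwise(iterable):
    # s -> (s0, s1), (s1, s2), (s2, s3), ...
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)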
Code example #4
File: indian.py, Project: B-Rich/Fem-Coding-Challenge
 def read_block(self, stream):
     line = stream.readline()
     if line.startswith('<'):
         return []
     sent = [str2tuple(word, sep='_') for word in line.split()]
     if self._tag_mapping_function:
         sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
     if not self._tagged: sent = [w for (w,t) in sent]
     if self._group_by_sent:
         return [sent]
     else:
         return sent
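Note the sep='_' argument: this corpus marks tags with an underscore rather than a slash. A tiny illustration (hypothetical token):

from nltk.tag.util import str2tuple
str2tuple('ghar_NN', sep='_')   # -> ('ghar', 'NN')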
Code example #5
File: util.py, Project: Drafffffff/nltk-1
def tagstr2tree(s,
                chunk_label="NP",
                root_label="S",
                sep="/",
                source_tagset=None,
                target_tagset=None):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == "[":
            if len(stack) != 1:
                raise ValueError("Unexpected [ at char {:d}".format(
                    match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == "]":
            if len(stack) != 2:
                raise ValueError("Unexpected ] at char {:d}".format(
                    match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError("Expected ] at char {:d}".format(len(s)))
    return stack[0]
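A usage sketch (output shown as a comment; the exact rendering can vary across NLTK versions):

tree = tagstr2tree("[ the/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
print(tree)
# (S (NP the/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN))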
Code example #6

import argparse
import codecs

from nltk.tag.util import str2tuple


def main():
    parser = argparse.ArgumentParser(description="""\
    Convert pos tagged file to whitespace tokenized file.
    """)
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    args = parser.parse_args()

    if args.input and args.output:
        with codecs.open(args.output, "w", "utf-8") as out:
            for sentence in codecs.open(args.input, "r", "utf-8"):
                tokens = [str2tuple(token) for token in sentence.split()]
                out.write(u"{0}\n".format(" ".join(
                    [word for word, tag in tokens])))
    else:
        parser.print_help()
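The inner list comprehension is exactly what nltk.tag.util.untag does, so the write could equivalently be phrased as:

out.write(u"{0}\n".format(" ".join(untag(tokens))))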
Code example #7
File: process.py, Project: innerfirexy/hcrc-map-task
def parse_worker(args):
    datum, parser, queue = args
    obsv, uid, tagged_str = datum
    # parse
    if tagged_str == '':
        return (obsv, uid, '')
    else:
        tagged = [str2tuple(t, sep='/') for t in tagged_str.split()]
        try:
            tree = list(parser.tagged_parse(tagged))
        except Exception as e:
            print('observation: {}, utterID: {}, sentence: {}'.format(obsv, uid, tagged_str))
            raise e
        else:
            tree_str = str(tree[0]).replace('\n', '')
            queue.put(1)
            return (obsv, uid, tree_str)
Code example #8
File: superchunk_reader.py, Project: urtonj/PA4
def superchunk2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a C{Tree}.
    Chunks are marked by square brackets (C{[...]}).  Words are
    delimited by whitespace, and each word should have the form
    C{I{text}/I{tag}}.  Words that do not contain a slash are
    assigned a C{tag} of C{None}.

    @return: A tree corresponding to the string representation.
    @rtype: C{tree}
    @param s: The string to be converted
    @type s: C{string}
    @param chunk_node: The label to use for chunk nodes
    @type chunk_node: C{string}
    @param top_node: The label to use for the root of the tree
    @type top_node: C{string}
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                t = str2tuple(text, sep)
                if t[1] is None:
                    # Chunk label.
                    stack[-1].node = t[0]
                else:
                    stack[-1].append(t)

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]
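Note that stack[-1].node = t[0] relies on the pre-NLTK-3 Tree.node attribute. On NLTK 3 and later the equivalent call (an adaptation, not part of the original project) would be:

stack[-1].set_label(t[0])  # NLTK 3 replaced the .node attribute with label()/set_label()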
Code example #9
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]
Code example #10
File: chunker.py, Project: kevinmel2000/sontekan
def main(corpus_file, output):
    with open(output, 'w') as out:
        # chunker, punct and is_ascii are module-level helpers defined
        # elsewhere in the file (a plausible chunker sketch follows below)
        for line in open(corpus_file, encoding='utf-8'):
            try:
                line = ftfy.fix_text(line)
            except Exception as e:
                print(e)
                continue

            tokens = [str2tuple(tok) for tok in re.sub(r'\s+', ' ', line).split()]
            try:
                tree = chunker.parse(tokens)
            except Exception as e:
                print(e)
                continue  # without this, a failed parse would reuse the previous tree
            for subtree in tree.subtrees(filter=lambda t: t.label() in ['NP', 'VP']):
                try:
                    text = [w.strip(punct) for (w, t) in subtree.leaves() if t != '.']
                    text = ' '.join(text).strip().lower()
                    if len(text) > 2 and not text.isdigit() and is_ascii(text):
                        out.write(text)
                        out.write('\n')
                except Exception as e:
                    print(e)
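The module-level chunker is not shown in the snippet; a plausible stand-in (an assumption, using a toy grammar in the style of the NLTK book) built with nltk.RegexpParser:

import nltk

# Toy cascaded grammar: NPs from determiners/adjectives/nouns,
# VPs as a verb followed by one or more NPs.
chunker = nltk.RegexpParser(r"""
    NP: {<DT|JJ|NN.*>+}
    VP: {<VB.*><NP>+}
""")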
Code example #11
def tokenizacion(archivo):
    # Tokenize the file contents
    palabras = nltk.word_tokenize(archivo.read())

    palabraSucias = []

    for palabra in palabras:
        palabraSucias.append(str2tuple(palabra))

    textoLimpio = untag(palabraSucias)

    # Drop punctuation (single-character tokens)
    textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]

    # str.lower() returns a new string, so the results must be collected
    textoLimpio = [palabra.lower() for palabra in textoLimpio]
    return textoLimpio
Code example #12
File: chunk.py, Project: detik19/BimaNLP
 def tagTokenExtractor(self, wordtaggedlist):
     for z in wordtaggedlist:
         taggedtoken = util.str2tuple('/'.join(z))
         print(taggedtoken[1])
Code example #13
File: tagger.py, Project: ssingh68/POS-Tagging-NLP
# Reading the pos-test-with-tags.txt file and splitting it into tokens
trainfile = file_train.read()
train_list = trainfile.split()

# Removing square brackets
if '[' in train_list:
    train_list = list(filter(('[').__ne__, train_list))
if ']' in train_list:
    train_list = list(filter((']').__ne__, train_list))

# Initializing a list, turning each token string into a (word, tag) tuple and appending it to a.
# A dictionary is later built from a to ease data manipulation.
a = []
for i in train_list:
    if "|" in str2tuple(i)[1]:
        # Ambiguous tags such as "NN|VB" are resolved to the first alternative
        temp = str2tuple(i)
        a.append((temp[0], temp[1].split("|")[0]))
    else:
        a.append(str2tuple(i))

# Finding the most frequent tag for each word and assigning it to the word
cfd = nltk.ConditionalFreqDist(a[:])
likely_tag = dict((word, cfd[word].max()) for word in dict(a))

# Reading the pos-test.txt file and splitting it
testfile = file_test.read().split()
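For reference, a toy illustration (invented data) of the most-frequent-tag lookup built above:

import nltk

pairs = [('run', 'VB'), ('run', 'NN'), ('run', 'VB'), ('the', 'DT')]
cfd = nltk.ConditionalFreqDist(pairs)
print(cfd['run'].max())   # VB - the tag observed most often for 'run'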
Code example #14
# Created at UC Berkeley 2015
# Authors: Christopher Hench
# ==============================================================================
'''This code presents summary statistics for MHG syllables used for scansion
based on the paper presented at the NAACL-CLFL 2016 by Christopher Hench and
Alex Estes.'''

import pandas as pd
from nltk.tag.util import str2tuple

with open("Data/CLFL_all_data.txt", "r", encoding="utf-8") as f:
    data = f.read()

lines = data.split('\n')
tags = [[str2tuple(x) for x in line.split()] for line in lines]
tags = [[x[0] for x in line] for line in tags]

all_lines = []
all_sylls = []
for line in tags:
    newline = []
    word = ""
    l_syllables = 0
    s_line = []
    s_word = []
    for syll in line:
        if syll == "WBY":
            newline.append(word)
            s_line.append(s_word)
            word = ""
Code example #15
testkey = sys.argv[2]

# Opening both files
file_testwithtags = open(testtagged, 'r')
file_key = open(testkey, 'r')

# Reading the pos-test-with-tags.txt file and splitting it into tokens
taggedtestfile = file_testwithtags.read()
taggedtest = taggedtestfile.split()

# Initializing a list, turning each token string into a (word, tag) tuple and appending it to a
a = []
for i in taggedtest:
    a.append(str2tuple(i))

#file_key = open(r"C:/Users/shrey/Desktop/George Mason University/Sem 2/AIT 690/Assignment/PA2/pos-test-key.txt")

##Reading the pos-test-key.txt file and perform splitting
testfile = file_key.read().split()

# Removing square brackets
if '[' in testfile:
    testfile = list(filter(('[').__ne__, testfile))
if ']' in testfile:
    testfile = list(filter((']').__ne__, testfile))

# Initializing a list, turning each token string into a (word, tag) tuple and appending it to b
b = []
for j in testfile:
Code example #16
 def transform(self, item):
     # Drop non-ASCII characters, then split the word/tag token
     item = item.encode('ascii', 'ignore').decode('ascii')
     return str2tuple(item, "/")
Code example #17
import nltk
from nltk.tag.util import str2tuple
from nltk.tag.util import untag
from nltk.tag.util import tuple2str

textoSucio = "It/pps recommended/vbd that/cs Fulton/np legislators/nns act/vb ``/`` to/to have/hv these/dts laws/nns studied/vbn and/cc revised/vbn to/in the/at end/nn of/in modernizing/vbg and/cc improving/vbg them/ppo ''/'' ./."
palabras = nltk.word_tokenize(textoSucio)
palabraSucias = []

for palabra in palabras:
    palabraSucias.append(str2tuple(palabra))
    # print(palabraSucias)

textoLimpio = untag(palabraSucias)

textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]

# Removing backtick quote tokens (filtering avoids mutating the list while iterating over it)
textoLimpio = [texto for texto in textoLimpio if texto != '``']

# stopwords = set(nltk.corpus.stopwords.words('english'))  # stop-word setup
# textoLimpio = [palabra for palabra in textoLimpio if palabra not in stopwords]

print(textoLimpio)
Code example #18
 def as_tuples(line):
     return [str2tuple(token, sep='_') for token in line.split(' ')]
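A quick usage sketch (treating as_tuples as a plain function; its enclosing class is not shown):

as_tuples("El_DT perro_NN")   # -> [('El', 'DT'), ('perro', 'NN')]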