import argparse
import codecs
import json
from collections import Counter, defaultdict

from nltk.tag.util import str2tuple, tuple2str
from tabulate import tabulate


def main():
    parser = argparse.ArgumentParser(description="Creates tag statistics.")
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    parser.add_argument("-L", "--lexicon", required=True, help="lexicon file")
    parser.add_argument("-M", "--max", help="maximum number of output rows")
    # argparse enforces the required arguments, so no further validation is needed.
    args = parser.parse_args()

    with open(args.lexicon) as lex_file:
        lexicon = json.load(lex_file)
    with codecs.open(args.output, "w", "utf-8") as out:
        wt = defaultdict(set)   # word -> set of tags seen with it
        wc = Counter()          # word -> total count
        wtc = Counter()         # "word/tag" -> count
        for sentence in codecs.open(args.input, "r", "utf-8"):
            tokens = [str2tuple(token) for token in sentence.split()]
            for word, tag in tokens:
                wt[word].add(tag)
                wc[word] += 1
                wtc[tuple2str((word, tag))] += 1

        r = {"Count": [], "Words": [], "Found": [], "Lexicon": []}
        max_num = int(args.max) if args.max else None
        for word, count in wc.most_common(max_num):
            r["Words"].append(word)
            r["Count"].append(count)
            tg = set()
            for tag in wt[word]:
                t = tuple2str((word, tag))
                # Mark tags missing from the word's lexicon entry with "*".
                in_lex = ""
                if lexicon.get(word.lower()):
                    if tag not in lexicon.get(word.lower()):
                        in_lex = "*"
                tg.add((tag + in_lex, wtc[t]))
            tg = sorted(tg, key=lambda k: k[1], reverse=True)
            r["Found"].append(", ".join(
                u"{0} ({1})".format(x, y) for x, y in tg))
            if lexicon.get(word.lower()):
                r["Lexicon"].append(", ".join(lexicon.get(word.lower())))
            else:
                r["Lexicon"].append("")
        out.write(u"{0}".format(tabulate(r, headers="keys", tablefmt="pipe")))
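# Hypothetical invocation (script and file names are illustrative, not from
# the original source):
#     python tag_stats.py -I corpus_tagged.txt -O stats.md -L lexicon.json -M 100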
def features_to_words(self, features):
    splitter_re = re.compile(r"\s")
    words = set(word for feature in features
                for word in splitter_re.split(feature))
    if self.use_pos_tag:
        # Strip the POS tag from each "word/tag" token.
        words = [str2tuple(word)[0] for word in words]
    # Drop empty strings left over from splitting.
    return [word for word in words if word]
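# Usage sketch (illustrative; assumes self.use_pos_tag is True — word order
# may vary because a set is built internally):
# >>> extractor.features_to_words(["the/DT cat/NN", "sat/VBD"])
# ['the', 'cat', 'sat']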
from itertools import product

from nltk.data import load
from nltk.tag.util import str2tuple
from tqdm import tqdm

# `pairwise` (adjacent-pair iterator) and `rawcount` (fast line counter) are
# helpers assumed to be defined elsewhere in the project.


def read_training_data(training_file):
    """
    Extracts part-of-speech (POS) tag, transition-between-tags, and emission
    counts from a tagged training corpus.

    The POS tag count keeps track of the number of times a given POS tag
    occurs in the training data. This is stored in a dictionary with POS tag
    keys and integer count values.

    The transition counts keep track of how often a first tag is followed by
    a second tag. This is stored in a dictionary with tuple(tag1, tag2) keys
    and the number of times tag1 is followed by tag2 as values.

    The emission count keeps track of the number of times a word and its
    associated tag occur in the data. This is stored in a dictionary with
    tuple(word, POS tag) keys and integer count values.

    The training file is expected to be a training set of POS-tagged
    sentences, separated by newline characters. Additional custom tags,
    "START" and "END", are included to indicate the start and end of each
    sentence.

    :param training_file: the location of the training file
    :return: a tuple of dictionaries tracking tag counts, transition counts,
        and emission counts
    """
    tag_types = list(load('help/tagsets/upenn_tagset.pickle').keys()) + [
        "START", "END", "-LRB-", "-RRB-", "#"
    ]
    # The tagset in nltk uses different notations for brackets and dashes.
    tag_types = [x for x in tag_types if x not in ["(", ")", "--"]]
    tag_type_permutations = list(product(tag_types, repeat=2))
    tag_counts = dict.fromkeys(tag_types, 0)
    transition_counts = dict.fromkeys(tag_type_permutations, 0)
    emission_counts = {}
    with open(training_file, "r") as training_data:
        for line in tqdm(training_data, total=rawcount(training_file),
                         desc="Training"):
            tagged_tokens = tuple(
                str2tuple(tagged_token) for tagged_token in line.split())
            tag_sequence = ("START",) + tuple(
                tagged_token[1] for tagged_token in tagged_tokens) + ("END",)
            for tag in tag_sequence:
                tag_counts[tag] += 1
            for tag_pair in pairwise(tag_sequence):
                transition_counts[tag_pair] += 1
            for tagged_token in tagged_tokens:
                if tagged_token in emission_counts:
                    emission_counts[tagged_token] += 1
                else:
                    emission_counts[tagged_token] = 1
    return tag_counts, transition_counts, emission_counts
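# A minimal sketch (not part of the original) showing how the raw counts
# returned by read_training_data could be turned into maximum-likelihood
# HMM probabilities. The helper name mle_probabilities is hypothetical.
def mle_probabilities(tag_counts, transition_counts, emission_counts):
    # P(tag2 | tag1) = count(tag1, tag2) / count(tag1)
    transition_probs = {
        (t1, t2): count / tag_counts[t1]
        for (t1, t2), count in transition_counts.items()
        if tag_counts[t1] > 0
    }
    # P(word | tag) = count(word, tag) / count(tag)
    emission_probs = {
        (word, tag): count / tag_counts[tag]
        for (word, tag), count in emission_counts.items()
        if tag_counts[tag] > 0
    }
    return transition_probs, emission_probs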
def read_block(self, stream):
    line = stream.readline()
    if line.startswith('<'):
        return []
    sent = [str2tuple(word, sep='_') for word in line.split()]
    if self._tag_mapping_function:
        sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent]
    if not self._tagged:
        sent = [w for (w, t) in sent]
    if self._group_by_sent:
        return [sent]
    else:
        return sent
def tagstr2tree(s, chunk_label="NP", root_label="S", sep="/",
                source_tagset=None, target_tagset=None):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """
    WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+")

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == "[":
            if len(stack) != 1:
                raise ValueError(
                    "Unexpected [ at char {:d}".format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == "]":
            if len(stack) != 2:
                raise ValueError(
                    "Unexpected ] at char {:d}".format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError("Expected ] at char {:d}".format(len(s)))
    return stack[0]
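# Usage sketch for tagstr2tree (the sentence is illustrative, not from the
# original source):
# >>> tagstr2tree("[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
# Tree('S', [Tree('NP', [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN')]),
#            ('sat', 'VBD'), ('on', 'IN'), Tree('NP', [('the', 'DT'), ('mat', 'NN')])])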
import argparse
import codecs

from nltk.tag.util import str2tuple


def main():
    parser = argparse.ArgumentParser(
        description="Convert a POS-tagged file to a whitespace-tokenized file.")
    parser.add_argument("-I", "--input", required=True, help="input file")
    parser.add_argument("-O", "--output", required=True, help="output file")
    # argparse enforces the required arguments, so no further validation is needed.
    args = parser.parse_args()

    with codecs.open(args.output, "w", "utf-8") as out:
        for sentence in codecs.open(args.input, "r", "utf-8"):
            # Drop the tags, keeping only the words.
            tokens = [str2tuple(token) for token in sentence.split()]
            out.write(u"{0}\n".format(" ".join(word for word, tag in tokens)))
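# Hypothetical invocation (script and file names are illustrative, not from
# the original source):
#     python untag_file.py -I corpus_tagged.txt -O corpus_tokens.txt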
def parse_worker(args):
    datum, parser, queue = args
    obsv, uid, tagged_str = datum
    # parse
    if tagged_str == '':
        return (obsv, uid, '')
    else:
        tagged = [str2tuple(t, sep='/') for t in tagged_str.split()]
        try:
            tree = list(parser.tagged_parse(tagged))
        except Exception as e:
            print('observation: {}, utterID: {}, sentence: {}'.format(
                obsv, uid, tagged_str))
            raise e
        else:
            tree_str = str(tree[0]).replace('\n', '')
            queue.put(1)
            return (obsv, uid, tree_str)
def superchunk2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a C{Tree}.
    Chunks are marked by square brackets (C{[...]}).  Words are
    delimited by whitespace, and each word should have the form
    C{I{text}/I{tag}}.  Words that do not contain a slash are
    assigned a C{tag} of C{None}.

    @return: A tree corresponding to the string representation.
    @rtype: C{tree}
    @param s: The string to be converted
    @type s: C{string}
    @param chunk_node: The label to use for chunk nodes
    @type chunk_node: C{string}
    @param top_node: The label to use for the root of the tree
    @type top_node: C{string}
    """
    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                t = str2tuple(text, sep)
                if t[1] is None:
                    # Chunk label.
                    stack[-1].node = t[0]
                else:
                    stack[-1].append(t)

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]
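# Usage sketch (illustrative; relies on the old NLTK Tree API where `node`
# is a settable attribute — a bare token inside brackets becomes the chunk
# label, overriding chunk_node):
# >>> superchunk2tree("he/PRP [VP runs/VBZ fast/RB]")
# Tree('S', [('he', 'PRP'), Tree('VP', [('runs', 'VBZ'), ('fast', 'RB')])])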
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """
    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]
def main(corpus_file, output):
    # `chunker`, `punct`, and `is_ascii` are assumed to be defined elsewhere
    # in the original module.
    with open(output, 'w') as out:
        for line in open(corpus_file, 'rb'):
            try:
                line = ftfy.fix_text(line.decode('utf-8'))
            except Exception as e:
                print(e)
                continue
            tokens = [str2tuple(tok) for tok in re.sub(r'\s+', ' ', line).split()]
            try:
                tree = chunker.parse(tokens)
            except Exception as e:
                print(e)
                continue
            # Keep only noun-phrase and verb-phrase subtrees.
            for subtree in tree.subtrees(filter=lambda t: t.label() in ['NP', 'VP']):
                try:
                    text = [w.strip(punct) for (w, t) in subtree.leaves() if t != '.']
                    text = ' '.join(text).strip().lower()
                    if len(text) > 2 and not text.isdigit() and is_ascii(text):
                        out.write(text)
                        out.write('\n')
                except Exception as e:
                    print(e)
def tokenizacion(archivo):
    # Tokenize the file contents.
    palabras = nltk.word_tokenize(archivo.read())
    palabraSucias = []
    for palabra in palabras:
        palabraSucias.append(str2tuple(palabra))

    # Strip the tags, then drop punctuation and other single-character tokens.
    textoLimpio = untag(palabraSucias)
    textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]

    # str.lower() returns a new string, so collect the results instead of
    # calling it for its (nonexistent) side effect.
    textoLimpio = [palabra.lower() for palabra in textoLimpio]
    return textoLimpio
def tagTokenExtractor(self, wordtaggedlist):
    for z in wordtaggedlist:
        # Re-join each (word, tag) pair and parse it back into a tuple.
        taggedtoken = util.str2tuple('/'.join(z))
        print(taggedtoken[1])
# Read the training file and split it into tokens.
trainfile = file_train.read()
train_list = trainfile.split()

# Remove square brackets.
if '[' in train_list:
    train_list = list(filter(('[').__ne__, train_list))
if ']' in train_list:
    train_list = list(filter((']').__ne__, train_list))

# Turn each string into a (word, tag) tuple and append it to a.
# Dictionaries are created for the training dataset to ease data manipulation;
# ambiguous tags like "NN|VB" are resolved to their first alternative.
a = []
for i in train_list:
    if "|" in str2tuple(i)[1]:
        temp = str2tuple(i)
        a.append((temp[0], temp[1].split("|")[0]))
    else:
        a.append(str2tuple(i))

# Find the most frequent tag for each word and assign it to the word.
cfd = nltk.ConditionalFreqDist(a)
likely_tag = dict((word, cfd[word].max()) for word in dict(a))

# Read the pos-test.txt file and split it.
testfile = file_test.read().split()
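# A minimal sketch (hypothetical, not in the original fragment) of how the
# likely_tag baseline could be applied to the test tokens, falling back to
# "NN" for unseen words:
# predicted = [(word, likely_tag.get(word, "NN")) for word in testfile]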
# Created at UC Berkeley 2015
# Authors: Christopher Hench
# ==============================================================================
'''This code presents summary statistics for MHG syllables used for scansion
based on the paper presented at the NAACL-CLFL 2016 by Christopher Hench and
Alex Estes.'''

import pandas as pd
from nltk.tag.util import str2tuple

with open("Data/CLFL_all_data.txt", "r", encoding="utf-8") as f:
    data = f.read()

lines = data.split('\n')
tags = [[str2tuple(x) for x in line.split()] for line in lines]
tags = [[x[0] for x in line] for line in tags]

all_lines = []
all_sylls = []
for line in tags:
    newline = []
    word = ""
    l_syllables = 0
    s_line = []
    s_word = []
    for syll in line:
        if syll == "WBY":
            newline.append(word)
            s_line.append(s_word)
            word = ""
testkey = sys.argv[2]

# Open both files.
file_testwithtags = open(testtagged, 'r')
file_key = open(testkey, 'r')

# Read the pos-test-with-tags.txt file and split it into tokens.
taggedtestfile = file_testwithtags.read()
taggedtest = taggedtestfile.split()

# Turn each string into a (word, tag) tuple and append it to a.
a = []
for i in taggedtest:
    a.append(str2tuple(i))

# Read the pos-test-key.txt file and split it.
testfile = file_key.read().split()

# Remove square brackets.
if '[' in testfile:
    testfile = list(filter(('[').__ne__, testfile))
if ']' in testfile:
    testfile = list(filter((']').__ne__, testfile))

# Turn each string into a (word, tag) tuple and append it to b.
b = []
for j in testfile:
def transform(self, item):
    # Drop non-ASCII characters, then split the "word/tag" string into a tuple.
    item = item.encode('ascii', 'ignore').decode('ascii')
    return str2tuple(item, "/")
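# Usage sketch (illustrative; the class name is hypothetical — non-ASCII
# characters are dropped before the string is split):
# >>> Transformer().transform(u"caf\u00e9/NN")
# ('caf', 'NN')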
import nltk
from nltk.tag.util import str2tuple
from nltk.tag.util import untag
from nltk.tag.util import tuple2str

# A Brown-corpus sentence of word/tag tokens (the closing quote token is ''/'').
textoSucio = ("It/pps recommended/vbd that/cs Fulton/np legislators/nns "
              "act/vb ``/`` to/to have/hv these/dts laws/nns studied/vbn "
              "and/cc revised/vbn to/in the/at end/nn of/in modernizing/vbg "
              "and/cc improving/vbg them/ppo ''/'' ./.")

palabras = nltk.word_tokenize(textoSucio)
palabraSucias = []
for palabra in palabras:
    palabraSucias.append(str2tuple(palabra))

# Strip the tags, then drop punctuation and other single-character tokens.
textoLimpio = untag(palabraSucias)
textoLimpio = [palabra for palabra in textoLimpio if len(palabra) > 1]

# Removing items from a list while iterating over it skips elements, so
# filter with a comprehension instead.
textoLimpio = [texto for texto in textoLimpio if texto != '``']

# stopwords = set(nltk.corpus.stopwords.words('english'))  # stopword setup
# textoLimpio = [palabra for palabra in textoLimpio if palabra not in stopwords]

print(textoLimpio)
def as_tuples(line):
    return [str2tuple(token, sep='_') for token in line.split(' ')]
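# Usage sketch (illustrative input, not from the original source):
# >>> as_tuples("The_DT dog_NN barks_VBZ")
# [('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')]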