# -*- coding: utf-8 -*-
import codecs

from pattern.de import tag

# `pattern_punctuation` (a compiled regex that strips punctuation) and
# `transform` (mapping a (word, tag) pair to a normalized token or None)
# are assumed to be defined elsewhere in this module.


def main(filepath):
    # Read the file as UTF-8 and replace punctuation with spaces.
    with codecs.open(filepath, 'r', 'utf-8') as f:
        text = f.read()
    text = pattern_punctuation.sub(' ', text)
    tagged_words = tag(text)
    # Count every normalized token; skip words that transform() rejects.
    word_counts = {}
    for tagword in tagged_words:
        transformed_word = transform(tagword)
        if transformed_word is None:
            continue
        if transformed_word in word_counts:
            word_counts[transformed_word] += 1
        else:
            word_counts[transformed_word] = 1
    # Print the words sorted by frequency, most frequent first.
    tuple_list = sorted(word_counts.items(), key=lambda item: item[1],
                        reverse=True)
    for tupl in tuple_list:
        print tupl[1], tupl[0]
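# A minimal alternative sketch (not the original author's code): the manual
# counting-and-sorting loop above can be expressed with collections.Counter,
# assuming the same hypothetical `transform` helper.
from collections import Counter


def count_transformed(tagged_words):
    # Count every non-None normalized token in one pass.
    counts = Counter(w for w in (transform(t) for t in tagged_words)
                     if w is not None)
    # most_common() already returns (word, count) pairs sorted by
    # descending frequency.
    return counts.most_common()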
def test_tag(self):
    # Assert [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")].
    v = de.tag("der grosse Hund")
    self.assertEqual(v, [("der", "DT"), ("grosse", "JJ"), ("Hund", "NN")])
    print "pattern.de.tag()"
# -*- coding: utf-8 -*-
from pattern.de import parse, split, pprint, tag

# from pprint import pprint

# s = parse('Die Katze liegt auf der Matte.')
# for sentence in split(s):
#     for word in sentence:
#         print(word)
#     pprint(sentence)

pprint(parse('Die Katze liegt auf der Matte mit weniger als 10%.',
             tags=True, chunks=True, relations=True, lemmata=True,
             encoding='utf-8', tagset="STTS"))

for word, pos in tag('Die Katze liegt auf der Matte mit weniger als 10%.',
                     tagset="STTS"):
    # Print only definite articles and common nouns.
    if pos in ("ARTDEF", "NN"):
        print word + '\t' + pos
def tag_article(article_to_tag):
    # POS-tag the article text and store the (word, tag) pairs as TagTuples.
    new_list = []
    for entry in tag(article_to_tag.content):
        new_list.append(TagTuple.TagTuple(entry[0], entry[1]))
    article_to_tag.tagged_content = new_list
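# `TagTuple.TagTuple` is not shown in this snippet; a minimal stand-in,
# assuming it is just a (word, tag) container (a hypothetical sketch, not the
# project's actual class), could be a namedtuple:
from collections import namedtuple

TagTuple = namedtuple('TagTuple', ['word', 'tag'])

# With such a container the loop above collapses to a list comprehension:
# article_to_tag.tagged_content = [TagTuple(w, t)
#                                  for w, t in tag(article_to_tag.content)]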
from pattern.de import parsetree, tag, singularize

# `filename` is assumed to be defined earlier in the script. The '_nn' name
# below is reconstructed from the '_ne' line; the original snippet used
# `dict_name_nn` before defining it.
dict_name_nn = 'dict_' + filename + '_nn'
dict_name_ne = 'dict_' + filename + '_ne'
json_name_nn = dict_name_nn + '.json'
json_name_ne = dict_name_ne + '.json'
# The name variables are re-bound to count dictionaries here; the JSON file
# names were already captured above.
dict_name_nn = {}
dict_name_ne = {}

with open('testfile.txt', 'r') as openfile:
    read_text = openfile.read()

parsetree_text = parsetree(read_text)
# pprint(parsetree_text)
# read_text = read_text.rstrip('\n')
# print(re.findall(r'[\w]+|[.,!?;]', read_text))
# pprint(parse(read_text, tags=True, chunks=True, relations=True,
#              lemmata=True, encoding='utf-8', tagset='STTS'))

for word, pos in tag(read_text, tagset='STTS'):
    if pos == 'NN':
        # Normalize common nouns to their singular form before counting.
        singularForm = singularize(word)
        if word == singularForm:
            pass
            # plural = True
            # print word + '\t' + singularForm + '\t' + str(plural)
        else:
            word = singularForm
            # plural = False
            # print word + '\t' + singularForm + '\t' + str(plural)
        if word not in dict_name_nn:
            dict_name_nn[word] = 1
        else:
            dict_name_nn[word] += 1
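# A follow-up sketch (an assumption; the original snippet computes
# `json_name_nn` but never uses it): presumably the counts were meant to be
# dumped to that file, mirroring the json.dump calls used further below.
import json

with open(json_name_nn, 'w') as exportfile_nn:
    json.dump(dict_name_nn, exportfile_nn, sort_keys=True, indent=4,
              ensure_ascii=False, separators=(',', ': '))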
def insertTagsToParsedDB(lastID, lastTitel, lastText):
    """Process the input, POS-tag it and write it to the dbParsedText DB."""
    parsedDatabase = SQLFORM(db.dbParsedText)
    inputDatabase = SQLFORM(db.dbInput)
    dictNN = {}
    dictNE = {}
    # SQL query to extract ID, title and text; the text of the last row
    # replaces the lastText argument.
    extractQueryInputDB = db.executesql('select id, inputTitle, inputText '
                                        'from dbInput')
    lastText = extractQueryInputDB[-1][-1]
    # POS-tag the text and count singularized common nouns (NN) and
    # proper nouns (NE).
    for word, postag in tag(lastText, tagset='STTS'):
        word = word.decode('utf-8')
        if postag == 'NN':
            word = singularize(word)
            dictNN[word] = dictNN.get(word, 0) + 1
        elif postag == 'NE':
            word = singularize(word)
            dictNE[word] = dictNE.get(word, 0) + 1
    listNN = dictNN.items()
    listNE = dictNE.items()
    return dictNE, dictNN
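# The NN/NE counting pattern above recurs in buildWordList below; a shared
# helper (hypothetical, for illustration, assuming pattern.de's tag() and
# singularize()) could factor it out. Note the .decode('utf-8') step above is
# Python 2 specific and is omitted here.
from pattern.de import tag, singularize


def count_nouns(text, tagset='STTS'):
    # Return ({NN word: count}, {NE word: count}) with nouns singularized.
    counts = {'NN': {}, 'NE': {}}
    for word, postag in tag(text, tagset=tagset):
        if postag in counts:
            word = singularize(word)
            counts[postag][word] = counts[postag].get(word, 0) + 1
    return counts['NN'], counts['NE']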
def buildWordList():
    """
    Build word lists and count dictionaries per DDC class (330 and 710) for
    words tagged NE and NN, and export the dictionaries as JSON files.
    """
    # Word lists per DDC class and tag.
    list330NE, list330NN, list330 = [], [], []
    list710NE, list710NN, list710 = [], [], []
    # Count dictionaries per DDC class and tag.
    dict330NE, dict330NN, dict330, dict330WithoutCommons = {}, {}, {}, {}
    dict710NE, dict710NN, dict710, dict710WithoutCommons = {}, {}, {}, {}

    for dirpath, dirs, files in os.walk('../../collecting/temp/'):
        for filename in fnmatch.filter(files, '*.txt'):
            # os.walk already yields the full directory path, so join it
            # directly with the file name (the original re-prefixed the root
            # directory, producing a broken path).
            with open(os.path.join(dirpath, filename), 'r') as openfile:
                parsefile = openfile.read()
            # The DDC class is encoded in the last three path characters.
            ddcFromFilepath = dirpath[-3:]
            for word, postag in tag(parsefile, tagset='STTS'):
                word = word.decode('utf-8')
                if postag not in ('NN', 'NE'):
                    continue
                word = singularize(word)
                if ddcFromFilepath == '330':
                    if postag == 'NN':
                        list330NN.append(word)
                        dict330NN[word] = dict330NN.get(word, 0) + 1
                    else:
                        list330NE.append(word)
                        dict330NE[word] = dict330NE.get(word, 0) + 1
                    list330.append(word)
                elif ddcFromFilepath == '710':
                    if postag == 'NN':
                        list710NN.append(word)
                        dict710NN[word] = dict710NN.get(word, 0) + 1
                    else:
                        list710NE.append(word)
                        dict710NE[word] = dict710NE.get(word, 0) + 1
                    list710.append(word)

    # Words common to both DDC classes.
    listCommonWords = set(list330).intersection(list710)

    # Count the words of each class that are not shared with the other one.
    for word in list330:
        if word not in listCommonWords:
            dict330WithoutCommons[word] = dict330WithoutCommons.get(word, 0) + 1
    for word in list710:
        if word not in listCommonWords:
            dict710WithoutCommons[word] = dict710WithoutCommons.get(word, 0) + 1

    # Merge the NE and NN dictionaries into one dictionary per class.
    dict330.update(dict330NE)
    dict330.update(dict330NN)
    dict710.update(dict710NE)
    dict710.update(dict710NN)

    # Dump the dictionaries into JSON files.
    exports = [
        ('../../collecting/dict330NE.json', dict330NE),
        ('../../collecting/dict330NN.json', dict330NN),
        ('../../collecting/dict330WithoutCommons.json', dict330WithoutCommons),
        ('../../collecting/dict330All.json', dict330),
        ('../../collecting/dict710NE.json', dict710NE),
        ('../../collecting/dict710NN.json', dict710NN),
        ('../../collecting/dict710WithoutCommons.json', dict710WithoutCommons),
        ('../../collecting/dict710All.json', dict710),
    ]
    for path, dictionary in exports:
        with open(path, 'w') as exportfile:
            json.dump(dictionary, exportfile, sort_keys=True, indent=4,
                      ensure_ascii=False, separators=(',', ': '))
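# An equivalent, more compact sketch of the "without commons" step (an
# illustration, not the original code), using set intersection and
# collections.Counter:
from collections import Counter


def counts_without_commons(words_a, words_b):
    # Count the words of list A that never occur in list B.
    common = set(words_a) & set(words_b)
    return Counter(w for w in words_a if w not in common)

# e.g. dict330WithoutCommons = dict(counts_without_commons(list330, list710))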