def extract(rows, target_postags, target_structures, target_word=None, mongodb=True, VERBOSE=True): print 'anchor pos tags:', color.render(', '.join(target_postags), 'lc') print 'structures:', color.render(', '.join([x[0]+':'+str(x[1]) for x in target_structures]), 'lc') print '='*60 collect_cnt, skip_cnt = 0, 0 for entry in rows: ## extract rows sid, sent, pos, raw_tree, raw_dep = entry if not mongodb else (entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep']) # read dependency and tree objs deps = dependency.read(raw_dep, return_type=dict) if not deps: continue tree = Tree(raw_tree) # collect certain dependency relations according to pre-specified pos tags ## cdeps: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), (u'Yeah', u'JJ', 1), (u'well', u'NN', 2), (u'gotta', u'NN', 6), (u'bowl', u'NN', 11), (u'vinegar', u'NN', 13), (u'put', u'VBN', 9)] cdeps = extract_anchors(deps, tree, targets=target_postags) total_word_cnt += len(tree.pos()) anchor_word_cnt += len(cdeps) ## ('is', 'VBZ', 8) in [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7) ...] for (word, pos, idx) in cdeps: ## check if this is the target word if a target specified if target_word and word.lower() != target_word.lower(): if VERBOSE: print color.render('(ancher[x]) '+word+'-'+str(idx)+' #'+pos, 'b') continue ## extract dependency relations which match the target structures rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures) if rdeps: ## got deps match the target structures if VERBOSE: print color.render('(anchor[v]) '+word+'-'+str(idx)+' #'+pos, 'g') T = [ _transform_to_tuple(dep) for dep in rdeps] for (rel, (l, li), (r, ri)) in T: print ' ',color.render(rel,'r'),color.render('( '+l+'-'+str(li)+', '+r+'-'+str(ri)+' )','y') print '='*60
def extract_and_save(rows, target_postags, target_structures, det_db_cfg, target_word=None, mongodb=True): lmtzr = WordNetLemmatizer() print 'anchor pos tags:', color.render(', '.join(target_postags), 'lc') print 'structures:', color.render(', '.join([x[0]+':'+str(x[1]) for x in target_structures]), 'lc') print '='*60 collect_cnt, skip_cnt = 0, 0 mc = pymongo.Connection(det_db_cfg['server_addr']) db = mc[det_db_cfg['db']] co = db[det_db_cfg['collection']] sent_cnt, total_word_cnt, anchor_word_cnt, anchor_word_structure_cnt = 0, 0, 0, 0 for entry in rows: ## extract rows sid, sent, pos, raw_tree, raw_dep = entry if not mongodb else (entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep']) # read dependency and tree objs deps = dependency.read(raw_dep, return_type=dict) if not deps: continue tree = Tree(raw_tree) # collect certain dependency relations according to pre-specified pos tags ## cdeps: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), (u'Yeah', u'JJ', 1), (u'well', u'NN', 2), (u'gotta', u'NN', 6), (u'bowl', u'NN', 11), (u'vinegar', u'NN', 13), (u'put', u'VBN', 9)] cdeps = extract_anchors(deps, tree, targets=target_postags) ## for stat sent_cnt += 1 total_word_cnt += len(tree.pos()) anchor_word_cnt += len(cdeps) ## ('is', 'VBZ', 8) in [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7) ...] for (word, pos, idx) in cdeps: ## check if this is the target word if a target specified if target_word and word.lower() != target_word.lower(): continue ## extract dependency relations which match the target structures rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures) if rdeps: ## got deps match the target structures print color.render('(anchor[v]) '+word+'-'+str(idx)+' #'+pos, 'g') T = [ _transform_to_tuple(dep) for dep in rdeps] for (rel, (l, li), (r, ri)) in T: print ' ',color.render(rel,'r'),color.render('( '+l+'-'+str(li)+', '+r+'-'+str(ri)+' )','y') lemma = lmtzr.lemmatize(word, _getWordNetPOS(pos)) # generate mongo obj mongo_obj = {} mongo_obj['sid'] = sid # sentence id mongo_obj['word'] = word # anchor word mongo_obj['pos'] = pos # pos tag of word mongo_obj['idx'] = idx # word index mongo_obj['deps'] = rdeps # related deps mongo_obj['lemma'] = lemma # word lemma co.insert(mongo_obj) anchor_word_structure_cnt += 1 mc.close() print '='*60 print 'write statistic log' with open('stat.log','w') as fw: fw.write('total sent'+'\t'+str(sent_cnt)+'\n') fw.write('total word'+'\t'+str(total_word_cnt)+'\n') fw.write('anchor word'+'\t'+str(anchor_word_cnt)+'\n') fw.write('anchor word with structures'+'\t'+str(anchor_word_structure_cnt)+'\n')