def show_confirm(workflow):
    for cmd in workflow:
        print >> sys.stderr, color.render('[cmd]', 'lc'), ' '.join(cmd)
    print >> sys.stderr, color.render('> correct? [Y/n] ', 'g'),
    ok = raw_input()
    print >> sys.stderr
    return ok
def calculate_pattern_scores():
    total = co_lexicon.count()
    for i, mdoc in enumerate(co_lexicon.find()):
        pattern = mdoc['pattern']
        percent = (i + 1) / float(total) * 100
        if not config.verbose:
            sys.stderr.write('[%s> %.2f%%%s]\r' % ('=' * (int(percent) + 1), percent, ' ' * (100 - int(percent))))
            sys.stderr.flush()
        else:
            logging.debug('[%.2f%%] (%d/%d) process %s' % (percent, i + 1, total, color.render(pattern, 'ly')))
        count = get_patcount(pattern)
        logging.debug('get count of "%s" (%d)' % (color.render(pattern, 'g'), len(count)))
        pattern_score = {} if not count else feature.pattern_scoring_function(count)
        mdoc = {'score': pattern_score, 'pattern': pattern}
        logging.debug('insert mdoc into %s' % color.render(co_patscore.full_name, 'ly'))
        co_patscore.insert(mdoc)
    sys.stderr.write('\n')
    logging.info('create index on %s in %s' % (color.render('pattern', 'g'), color.render(co_patscore.full_name, 'ly')))
    co_patscore.create_index("pattern")
def execute_workflow(workflow):
    for cmd in workflow:
        print >> sys.stderr, color.render('[' + cmd[0].split('/')[-1] + ']', 'r')
        print >> sys.stderr, color.render(' start ' + '>' * 10, 'y')
        retcode = subprocess.call(cmd, shell=False)
        print >> sys.stderr, color.render(' end ' + '<' * 10, 'b')
    return True
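## A minimal driver sketch for the two helpers above. It assumes a workflow
## is a list of argv lists (as both functions expect); the script names are
## hypothetical:
# workflow = [['python', 'extract_pattern.py', '-v'],
#             ['python', 'pattern_scoring.py']]
# if show_confirm(workflow).strip().lower() != 'n':
#     execute_workflow(workflow)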
def show_most_common(posU, posP, threshold=1.0, max_usage_cnt=10, min_cnt=10, min_percent=0.01):
    for anchor in posP:
        print '=' * 10, anchor, '=' * 10
        itemP = sorted(posP[anchor].items(), key=lambda x: x[1], reverse=True)
        accum = 0.0
        for (i, (usage, portion)) in enumerate(itemP):
            if i == max_usage_cnt:
                break
            if portion < min_percent and posU[anchor][usage] < min_cnt:
                break
            accum += portion
            colorful = []
            for x in usage.split():
                if x == 'PERSON':
                    colorful.append(color.render('PERSON', 'g'))
                elif x == 'SOMETHING':
                    colorful.append(color.render('SOMETHING', 'r'))
                elif re.match(r'#[^\.]+\.[a-z]', x):  # e.g., "#familiar.j"
                    colorful.append(color.render(x, 'lc'))
                else:
                    colorful.append(x)
            colorful_usage = ' '.join(colorful)
            print colorful_usage, '\t', posU[anchor][usage], '\t', round(posP[anchor][usage] * 100.0, 4), '%'
            if accum >= threshold:
                break
def run(targets_rules):
    udocIDs = co_deps.distinct('udocID')
    MaxudocID = max(udocIDs)
    for targets, rule in targets_rules:
        rule_str = color.render(' '.join([str(x) + ',' + str(y) for x, y in rule]), 'lightblue')
        for udocID in udocIDs:
            logging.info(' process %d/%d; rule: %s' % (udocID, MaxudocID, rule_str))
            doc = list(co_deps.find({'udocID': udocID}))
            ## extract all sentences in one document
            sents = extract_sents(doc)
            for sent in sents:
                ## extract patterns in each sentence
                pats = extract_pattern(sent, targets, rule)
                ## display results
                if config.verbose:
                    sent_str = ' '.join([k[0] for k in sorted(set(reduce(lambda x, y: x + y,
                        [((d['x'], d['xIdx']), (d['y'], d['yIdx'])) for d in sent])), key=lambda a: a[1])][1:])
                    logging.debug('%s (%d)' % (sent_str, len(pats)))
                    for p in pats:
                        pat_str = ' '.join([x[0] for x in p['pat']])
                        logging.debug('  %s %.1f' % (color.render(pat_str.lower(), 'g'), p['weight']))
                ## store back in mongo
                for p in pats:
                    mdoc = {
                        'sent_length': sent[0]['sent_length'],
                        'udocID': sent[0]['udocID'],
                        'usentID': sent[0]['usentID'],
                        config.category: sent[0][config.category],
                        'pattern_length': len(p['pat']),
                        'pattern': ' '.join([x[0] for x in p['pat']]),
                        'rule': p['matched_rule'],
                        'anchor': p['anchor'][0],
                        'anchor_type': p['anchor'][1],
                        'anchor_idx': p['anchor'][2],
                        'weight': p['weight']
                    }
                    co_pats.insert(mdoc)
def setting_prompt():
    for i, setting in enumerate(db[config.co_feature_setting_name].find()):
        if setting['feature_name'] == 'fusion':
            continue
        sid = str(setting['_id'])
        sids_map[i] = sid
        settings[sid] = setting
        sname = setting['feature_name']
        print >> sys.stderr, i, '>', color.render(sid, 'yellow')
        for x in setting:
            if x == '_id':
                continue
            print >> sys.stderr, '\t', x, ':', color.render(str(setting[x]), 'g')
    print >> sys.stderr
def check_indexes(check_list, verbose=True):
    res = []
    for co, idx_name in check_list:
        INDEXED = False
        current_idx_full_names = co.index_information().keys()
        for current_idx_full_name in current_idx_full_names:
            current_idx = '_'.join(current_idx_full_name.split('_')[:-1])
            if current_idx == idx_name:
                INDEXED = True
                break
        if verbose:
            logging.info('collection: %s, index: %s (%s)' % (color.render(co.full_name, 'y'), color.render(idx_name, 'g'), 'o' if INDEXED else 'x'))
        if not INDEXED:
            co.create_index(idx_name)
            if verbose:
                logging.warn('create index on %s in %s' % (color.render(idx_name, 'g'), color.render(co.full_name, 'y')))
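## Minimal usage sketch for check_indexes(check_list): pairs of (collection,
## index field). The handles and field names below mirror ones used elsewhere
## in this repo, but are only illustrative here:
# check_indexes([(co_pats, 'udocID'), (co_sents, 'udocID')])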
def generate_feature_vectors(src_setting_ids):
    global co_feature_setting, eids, udocID_eid
    feature_pool = {}
    feature_vectors = {}
    # for each src_setting_id
    for src_setting_id in src_setting_ids:
        # find feature_name --> collection_name
        try:
            feature_name = co_feature_setting.find_one({'_id': ObjectId(src_setting_id)})['feature_name']
        except Exception:
            print 'check the format of feature setting:', src_setting_id, 'in mongodb'
            return False
        collection_name = 'features.' + feature_name
        ## use src_setting_id as prefix
        prefix = src_setting_id
        ## gathering
        number = db[collection_name].find({'setting': src_setting_id}).count()
        if number == 0:
            logging.error("can't find any instances with id " + color.render(src_setting_id, 'y') + ' in ' + color.render(collection_name, 'g'))
            return False
        for mdoc in db[collection_name].find({'setting': src_setting_id}):
            udocID = mdoc['udocID']
            emotion = mdoc['emotion']
            ## use emotion index as eid
            eid = eids[emotion]
            if eid not in feature_vectors:
                feature_vectors[eid] = defaultdict(list)
            ## save the mapping of udocID -> eid
            # udocID_gid[udocID] = eid
            feature = mdoc['feature']
            if not feature:
                feature_vectors[eid][udocID] = []
            else:
                for f_name, f_value in feature:
                    # combine f_name with prefix
                    f_name = '#'.join([prefix, f_name])
                    # generate fid
                    if f_name not in feature_pool:
                        feature_pool[f_name] = len(feature_pool)
                    # get fid
                    fid = feature_pool[f_name]
                    feature_vectors[eid][udocID].append((fid, f_value))
    return feature_vectors
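## Shape of the value returned above (numbers illustrative): keyed by emotion
## index <eid>, then by <udocID>, with sparse (fid, value) pairs per document:
#   feature_vectors = { 0: {12: [(0, 1.0), (3, 0.5)], 15: []},
#                       1: {27: [(1, 2.0)]} }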
def get_pattern_feature(udocID):
    sents = {x['usentID']: x['sent_length'] for x in co_sents.find({'udocID': udocID})}
    usentID_offset = min(sents)
    total_words = sum([sents[x] for x in sents])
    th1 = total_words * config.begPercentage / float(100)
    th2 = total_words * (config.begPercentage + config.midPercentage) / float(100)
    patFeature = Counter()
    ## find all pats in the document <udocID>
    pats = list(co_pats.find({'udocID': udocID}))
    if config.verbose:
        print >> sys.stderr, '\t%s (%d pats)\t' % (color.render('#' + str(udocID), 'y'), len(pats))
    for pat in pats:
        if get_total_count(pat['pattern']) >= config.min_count:
            ## find the pattern position (beginning/middle/end)
            lanchorID = sum([sents[usentID_offset + i] for i in range(pat['usentID'] - usentID_offset)]) + pat['anchor_idx']
            if lanchorID <= th1:
                position = 'beginning'
            elif lanchorID <= th2:
                position = 'middle'
            else:
                position = 'end'
            key = '@' + position + '_' + pat['pattern']
            patFeature[key] += 1
    return patFeature
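## Worked example of the position thresholds above (illustrative numbers, not
## from a real run): with total_words = 200, begPercentage = 20 and
## midPercentage = 60, th1 = 40 and th2 = 160; an anchor at word 35 is tagged
## 'beginning', at word 100 'middle', and at word 180 'end'.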
def create_keyword_features():
    ## list of emotions
    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]
    for (ie, gold_emotion) in enumerate(emotions):
        ## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
        docs = list(co_docs.find({'emotion': gold_emotion}))
        print >> sys.stderr, '%d > %s ( %d docs )' % (ie, color.render(gold_emotion, 'g'), len(docs))
        for doc in docs:
            udocID = doc['udocID']
            kw_feature = get_keyword_feature(udocID).items()
            mdoc = {
                "emotion": gold_emotion,
                "udocID": doc['udocID'],
                "feature": kw_feature,
                "setting": setting_id  # looks like "5369fb11d4388c0aa4c5ca4e"
            }
            co_feature.insert(mdoc)
    co_feature.create_index("setting")
def get_keyword_feature(udocID):
    keywordFeature = Counter()
    ## find all words in the document <udocID>
    words = []
    POSs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:
        words.extend(sent_mdoc['sent'].split(' '))     # words: list of 'happy'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))  # POSs: list of 'happy/JJ'
    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render('#' + str(udocID), 'y'), len(words))
    for idx, word in enumerate(words):
        word = word.lower()
        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('J'):
                pos = 'a'
            elif POS.startswith('V'):
                pos = 'v'
            elif POS.startswith('R'):
                pos = 'r'
            else:
                pos = 'n'
            word = lmtzr.lemmatize(word, pos)
        if word in keyword_list:
            keywordFeature[word] += 1
    return keywordFeature
def get_document_feature(udocID):
    sents = {x['usentID']: x['sent_length'] for x in co_sents.find({'udocID': udocID})}
    usentID_offset = min(sents)
    total_words = sum([sents[x] for x in sents])
    th1 = total_words * config.begPercentage / float(100)
    th2 = total_words * (config.begPercentage + config.midPercentage) / float(100)
    # print sents, '\ntotal_words = ', total_words, '\nusentID_offset = ', usentID_offset, '\nth1 = ', th1, '\nth2 = ', th2
    docfeature = Counter()
    ## find all pats in the document <udocID>
    pats = list(co_pats.find({'udocID': udocID}))
    if config.verbose:
        print >> sys.stderr, '\t%s (%d pats)\t' % (color.render('#' + str(udocID), 'y'), len(pats))
    for pat in pats:
        ## find the pattern position (beginning/middle/end)
        lanchorID = sum([sents[usentID_offset + i] for i in range(pat['usentID'] - usentID_offset)]) + pat['anchor_idx']
        if lanchorID <= th1:
            position = 'beginning'
        elif lanchorID <= th2:
            position = 'middle'
        else:
            position = 'end'
        # print '='*30, '\n', pat['pattern'], '\n', 'lanchorID = ', lanchorID, '\n', 'position = ', position
        patfeature = get_patfeature(udocID, position, pat['pattern'])
        for e in patfeature:
            key = '#position' + '@' + position + '_' + e
            docfeature[key] += patfeature[e]
    return docfeature
def document_scoring(udocID):
    # find all pats in the document <udocID>
    pats = list(co_pats.find({'udocID': udocID}))
    global search_list
    if config.verbose:
        print >> sys.stderr, '\t%s (%d pats)\t' % (color.render('#' + str(udocID), 'y'), len(pats))
    D = defaultdict(list)
    # calculate the event score in each pattern
    for pat in pats:
        # ignore patterns with occurrence less than x
        # use -l x or --limit x to specify
        if search_list:
            if pat['pattern'] not in search_list:
                continue
        EventScores = event_scoring(pat)
        for emotion in EventScores:
            D[emotion].append(EventScores[emotion])
    scores = dict([(e, sum(D[e]) / float(len(D[e]))) for e in D])
    return scores
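## Scoring sketch: each emotion's document score is the mean of that
## emotion's event scores over all matched patterns, e.g. if
## D['happy'] = [0.2, 0.4] then scores['happy'] = 0.3 (illustrative numbers).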
def check_indexes(target, indexes, auto=True):
    #                 |-- idx_name --| |--------- idx_value ----------|
    # [(u'_id_',      {u'key': [(u'_id', 1)],       u'v': 1}),
    #  (u'pattern_1', {u'key': [(u'pattern', 1.0)], u'v': 1})]
    existed = set()
    for (idx_name, idx_value) in target.index_information().items():
        for idx_str, idx_n in idx_value['key']:
            idx_n = int(idx_n)
            if idx_str in indexes:
                existed.add(idx_str)
    ## not all required indexes are present
    if len(existed) < len(set(indexes)):
        to_be_created = [x for x in indexes if x not in existed]
        to_be_created_str = color.render(', '.join(to_be_created), 'g')
        if auto:
            print >> sys.stderr, '(warning) missing necessary index(es)', to_be_created_str
            for idx_str in to_be_created:
                print >> sys.stderr, 'automatically creating index', idx_str, '...',
                sys.stderr.flush()
                ## create index in target collection
                target.create_index(idx_str)
                print >> sys.stderr, 'done'
        else:
            print >> sys.stderr, '(error) please manually create index(es)', to_be_created_str, 'before calculating the score'
            return False
    return True
def check_destination(pathes, token, ext='txt'):
    if not os.path.exists(pathes['_root_']):
        os.mkdir(pathes['_root_'])
    new_pathes = {}
    for ftype in pathes:
        if ftype.startswith('_') and ftype.endswith('_'):
            continue
        fn = pathes[ftype]
        ## auto-generated filename
        fn = fn if fn else '.'.join([token, ftype, ext])
        ## check if the destination path already exists
        # join root to yield the destination path
        dest_path = os.path.join(pathes['_root_'], fn)
        new_pathes[ftype] = dest_path
        ## destination already exists
        if os.path.exists(dest_path) and not config.overwrite:
            if not fusion_all:
                print >> sys.stderr, '[error] destination file', color.render(dest_path, 'red'), 'already exists'
                print >> sys.stderr, '        use -o or --overwrite to force overwrite'
                exit(-1)
            else:
                return False
    return new_pathes
def show(collected):
    for anchor in collected:
        print '=' * 10, anchor, '=' * 10
        for (i, (usage, usage_cnt, usage_portion)) in enumerate(collected[anchor]):
            colorful = []
            for x in usage.split():
                if x == 'PERSON':
                    colorful.append(color.render('PERSON', 'g'))
                elif x == 'SOMETHING':
                    colorful.append(color.render('SOMETHING', 'r'))
                elif re.match(r'#[^\.]+\.[a-z]', x):  # e.g., "#familiar.j"
                    colorful.append(color.render(x, 'lc'))
                else:
                    colorful.append(x)
            colorful_usage = ' '.join(colorful)
            print colorful_usage, '\t', usage_cnt, '\t', round(usage_portion * 100.0, 4), '%'
def find_intersection(eval_mdoc):
    LJ40K = sorted([x['emotion'] for x in db['emotions'].find({'label': 'LJ40K'})])
    Mishne05 = sorted([x['emotion'] for x in db['emotions'].find({'label': 'Mishne05'})])
    inter = []
    for e in set(LJ40K + Mishne05):
        if e in LJ40K and e in Mishne05:
            inter.append(e)
    inter_accuracy = {}
    for e in eval_mdoc['accuracy']:
        if e in inter:
            inter_accuracy[e] = eval_mdoc['accuracy'][e]
    pprint(inter_accuracy)
    print 'avg accuracy in overall\t\t', color.render(str(eval_mdoc['avg_accuracy']), 'g')
    print 'avg accuracy in intersection\t', color.render(str(round(sum(inter_accuracy.values()) / float(len(inter_accuracy.values())), 4)), 'y')
def get_keyword_feature(udocID):
    sents = {x['usentID']: x['sent_length'] for x in co_sents.find({'udocID': udocID})}
    total_words = sum([sents[x] for x in sents])
    th1 = total_words * config.begPercentage / float(100)
    th2 = total_words * (config.begPercentage + config.midPercentage) / float(100)
    keywordFeature = Counter()
    ## find all words in the document <udocID>
    words = []
    POSs = []
    wordIDs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:
        ## words: list of 'happy'
        words.extend(sent_mdoc['sent'].split(' '))
        ## POSs: list of 'happy/JJ'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))
        ## wordIDs: list of 'word id'
        wordID_offset = 0
        for key in sents:
            if key < sent_mdoc['usentID']:
                wordID_offset += sents[key]
        wordIDs.extend([(x + 1 + wordID_offset) for x in range(sents[sent_mdoc['usentID']])])
    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render('#' + str(udocID), 'y'), len(words))
    for idx, word in enumerate(words):
        word = word.lower()
        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('J'):
                pos = 'a'
            elif POS.startswith('V'):
                pos = 'v'
            elif POS.startswith('R'):
                pos = 'r'
            else:
                pos = 'n'
            word = lmtzr.lemmatize(word, pos)
        if wordIDs[idx] <= th1:
            position = 'beginning'
        elif wordIDs[idx] <= th2:
            position = 'middle'
        else:
            position = 'end'
        if word in keyword_list:
            key = '@' + position + '_' + word
            keywordFeature[key] += 1
    return keywordFeature
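## The emitted keys combine position and keyword, e.g. a 'happy' falling in
## the first begPercentage% of the document yields '@beginning_happy'
## (example key; the actual keywords depend on the configured keyword_list).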
def find_intersection(eval_mdoc):
    LJ40K = sorted([x['emotion'] for x in db['emotions'].find({'label': 'LJ40K'})])
    Mishne05 = sorted([x['emotion'] for x in db['emotions'].find({'label': 'Mishne05'})])
    inter = []
    for e in set(LJ40K + Mishne05):
        if e in LJ40K and e in Mishne05:
            inter.append(e)
    inter_accuracy = {}
    for e in eval_mdoc['accuracy']:
        if e in inter:
            inter_accuracy[e] = eval_mdoc['accuracy'][e]
    pprint(eval_mdoc['accuracy'])
    print >> sys.stderr, 'avg accuracy in overall\t\t', color.render(str(eval_mdoc['avg_accuracy']), 'g')
    print >> sys.stderr, 'avg accuracy in intersection\t', color.render(str(round(sum(inter_accuracy.values()) / float(len(inter_accuracy.values())), 4)), 'y')
def update_all_document_scores():
    global search_list
    search_list = get_search_list()
    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]
    ## drop the docscore collection if overwrite is enabled
    if config.overwrite:
        print >> sys.stderr, 'drop collection', config.co_docscore_name
        co_docscore.drop()
    for (ie, gold_emotion) in enumerate(emotions):
        ## get all documents with emotion <gold_emotion> and ldocID >= 800
        docs = list(co_docs.find({'emotion': gold_emotion, 'ldocID': {'$gte': 800}}))
        if config.verbose:
            print >> sys.stderr, '%d > %s ( %d docs )' % (ie, color.render(gold_emotion, 'g'), len(docs))
        else:
            print >> sys.stderr, '%d > %s' % (ie, color.render(gold_emotion, 'g'))
        for doc in docs:
            # score a document against the 40 different emotions
            scores = document_scoring(doc['udocID'])
            mdoc = {'udocID': doc['udocID'], 'gold_emotion': gold_emotion, 'scores': scores}
            co_docscore.insert(mdoc)
def choose():
    while True:
        print >> sys.stderr, '> choose setting ID(s) [ 0 ~ ' + str(len(sids_map) - 1) + ' ]: ',
        str_sids = raw_input()
        try:
            setting_ids = map(lambda x: sids_map[int(x)], re.findall(r'([0-9]+)\s*,?', str_sids))
            break
        except (KeyError, ValueError):
            print >> sys.stderr, color.render('\n[error] input the CORRECT format, e.g., "0,1,2" or "1", with valid index numbers\n', 'r')
    print >> sys.stderr
    return setting_ids
def calculate_pattern_scores_remove_self(category):
    ## list of categories
    categories = [x[category] for x in co_cate.find({'label': category})]
    logging.debug('found %d categories' % len(categories))
    for (ie, gold_category) in enumerate(categories):
        ## get all documents with category <gold_category> (ldocID: 0-799 for training, 800-999 for testing)
        docs = list(co_docs.find({category: gold_category}))
        logging.info('%d/%d %s: %d docs' % (ie, len(categories), color.render(gold_category, 'lg'), len(docs)))
        for ith_doc, doc in enumerate(docs):
            udocID = doc['udocID']
            ## find all pats in the document <udocID>
            pats = list(co_pats.find({'udocID': udocID}))
            logging.info('%s --> %s (%d pats) [%d/%d]\t%.1f%%' % (color.render(gold_category, 'lg'), color.render(str(udocID), 'ly'), len(pats), ith_doc + 1, len(docs), (ith_doc + 1) / float(len(docs)) * 100))
            for pat in pats:
                pattern_score = {}
                pattern = pat['pattern']
                count = get_patcount(pattern)
                logging.debug('get count of "%s" (%d)' % (color.render(pattern, 'g'), len(count)))
                if count:
                    count = remove_self_count(udocID, pattern, count, category=config.category)
                    logging.debug('remove self count of "%s" in udocID: %s' % (color.render(pattern, 'g'), color.render(str(udocID), 'lc')))
                    pattern_score = feature.pattern_scoring_function(count)
                mdoc = {'score': pattern_score, 'udocID': udocID, 'pattern': pattern}
                co_patscore.insert(mdoc)
    co_patscore.create_index("pattern")
def load_data():
    global answers, golds, labels
    paths = search_files()
    if paths:
        print >> sys.stderr, '[path] [confusion_matrix.py] path for out:', paths['out']
        print >> sys.stderr, '[path] [confusion_matrix.py] path for gold:', paths['gold']
        answers = [line.strip().split('\t')[0] for line in open(paths['out'])]
        golds = [line.strip().split('\t')[0] for line in open(paths['gold'])]
        labels = {line.strip().split('\t')[0]: line.strip().split('\t')[-1] for line in open(paths['gold'])}
    else:
        print >> sys.stderr, color.render('[error] [confusion_matrix.py] cannot find the files.', 'r')
def run():
    global co_feature_setting
    # collection pointer of feature settings
    co_feature_setting = db[config.co_feature_setting_name]
    # sorted src_setting_ids
    src_setting_ids = parse_src_setting_ids()
    dest_setting_id = obtain_dest_setting_id(src_setting_ids)
    dest_paths = get_dest_paths(dest_setting_id)
    ## logging
    logging.debug('src_setting_ids: ' + color.render(','.join(src_setting_ids), 'y'))
    for ftype, fn in sorted(dest_paths.items()):
        logging.debug(ftype + ': ' + color.render(fn, 'g'))
    logging.info('dest_setting_id: ' + color.render(dest_setting_id, 'y'))
    if is_dest_files_exist(dest_paths) and not config.overwrite:
        # all destination files already exist
        logging.info('all files already exist')
    else:
        # some destination files are missing
        logging.info('generate feature vectors')
        feature_vectors = generate_feature_vectors(src_setting_ids)
        if not feature_vectors:
            exit(-1)
        logging.info('transform to svm format')
        str_feature_vectors = tranform_to_svm_format(feature_vectors)
        logging.info('generate train/test files')
        generate_train_test_files(str_feature_vectors, dest_paths)
    return True
def extract(rows, target_postags, target_structures, target_word=None, mongodb=True, VERBOSE=True):
    print 'anchor pos tags:', color.render(', '.join(target_postags), 'lc')
    print 'structures:', color.render(', '.join([x[0] + ':' + str(x[1]) for x in target_structures]), 'lc')
    print '=' * 60
    collect_cnt, skip_cnt = 0, 0
    total_word_cnt, anchor_word_cnt = 0, 0  # these counters were used below but never initialized
    for entry in rows:
        ## extract rows
        sid, sent, pos, raw_tree, raw_dep = entry if not mongodb else (entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep'])
        # read dependency and tree objs
        deps = dependency.read(raw_dep, return_type=dict)
        if not deps:
            continue
        tree = Tree(raw_tree)
        # collect certain dependency relations according to pre-specified pos tags
        ## cdeps: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), (u'Yeah', u'JJ', 1), (u'well', u'NN', 2), (u'gotta', u'NN', 6), (u'bowl', u'NN', 11), (u'vinegar', u'NN', 13), (u'put', u'VBN', 9)]
        cdeps = extract_anchors(deps, tree, targets=target_postags)
        total_word_cnt += len(tree.pos())
        anchor_word_cnt += len(cdeps)
        ## ('is', 'VBZ', 8) in [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7) ...]
        for (word, pos, idx) in cdeps:
            ## check if this is the target word, if a target is specified
            if target_word and word.lower() != target_word.lower():
                if VERBOSE:
                    print color.render('(anchor[x]) ' + word + '-' + str(idx) + ' #' + pos, 'b')
                continue
            ## extract dependency relations which match the target structures
            rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures)
            if rdeps:
                ## got deps matching the target structures
                if VERBOSE:
                    print color.render('(anchor[v]) ' + word + '-' + str(idx) + ' #' + pos, 'g')
                T = [_transform_to_tuple(dep) for dep in rdeps]
                for (rel, (l, li), (r, ri)) in T:
                    print '  ', color.render(rel, 'r'), color.render('( ' + l + '-' + str(li) + ', ' + r + '-' + str(ri) + ' )', 'y')
    print '=' * 60
def print_confirm(confirm_msg, bar=40, halt=True):
    for msg in confirm_msg:
        msg = list(msg)
        if len(msg) > 1:
            for i in range(len(msg) - 1):
                if type(msg[i + 1]) == bool:
                    msg[i + 1] = color.render(str(msg[i + 1]), color_for[bool][msg[i + 1]])
        if len(msg) == 3 and type(msg[2]) == dict:
            print >> sys.stderr, msg[0], ':', msg[1], msg[2][msg[1]]
        elif len(msg) == 3 and type(msg[2]) == str:
            print >> sys.stderr, msg[0], ':', msg[1], msg[2]
        elif len(msg) == 2:
            print >> sys.stderr, msg[0], ':', msg[1]
        else:
            print >> sys.stderr, msg
    print >> sys.stderr, '=' * bar
    if halt:
        print >> sys.stderr, 'press any key to start...',
        raw_input()
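## Minimal usage sketch (field names illustrative; booleans are colorized via
## color_for[bool] as implemented above):
# print_confirm([
#     ('verbose', config.verbose),
#     ('overwrite', config.overwrite),
#     ('collection', config.co_pats_name),
# ], bar=40, halt=True)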
def get_pattern_feature(udocID):
    patFeature = Counter()
    ## find all pats in the document <udocID>
    pats = list(co_pats.find({'udocID': udocID}))
    if config.verbose:
        print >> sys.stderr, '\t%s (%d pats)\t' % (color.render('#' + str(udocID), 'y'), len(pats))
    for pat in pats:
        if get_count(pat['pattern']) >= config.min_count:
            patFeature[pat['pattern']] += 1
    return patFeature
def build_lexicon():
    print 'type: ', wordType
    print 'lemma: ', str(lemma)
    keyword_list = [x['word'] for x in co_keywords.find({'type': wordType})]
    keywordCount = defaultdict(Counter)
    for (ie, e) in enumerate(emotions):
        print >> sys.stderr, '%d > %s' % (ie, color.render(e, 'g'))
        for doc in co_docs.find({'emotion': e, 'ldocID': {'$lt': 800}}):
            udocID = doc['udocID']
            mdocs = list(co_sents.find({'udocID': udocID}))
            for mdoc in mdocs:
                words = mdoc['sent'].split(' ')
                POSs = [x.split('/').pop() for x in mdoc['sent_pos'].split(' ')]
                for idx, word in enumerate(words):
                    word = word.lower()
                    if lemma:
                        if POSs[idx].startswith('N'):
                            pos = 'n'
                        elif POSs[idx].startswith('V'):
                            pos = 'v'
                        elif POSs[idx].startswith('J'):
                            pos = 'a'
                        elif POSs[idx].startswith('R'):
                            pos = 'r'
                        else:
                            pos = None
                        if pos:
                            word = lmtzr.lemmatize(word, pos)
                    if word in keyword_list:
                        keywordCount[word][e] += 1
    for word in keywordCount:
        mdoc = {'keyword': word, 'count': keywordCount[word]}
        co_keyword_lexicon.insert(mdoc)
def get_document_feature(udocID):
    docfeature = Counter()
    ## find all pats in the document <udocID>
    pats = list(co_pats.find({'udocID': udocID}))
    if config.verbose:
        print >> sys.stderr, '\t%s (%d pats)\t' % (color.render('#' + str(udocID), 'y'), len(pats))
    for pat in pats:
        patfeature = get_patfeature(pat['pattern'], udocID)
        for e in patfeature:
            docfeature[e] += patfeature[e]
    return docfeature
def create_keyword_TFIDF_features(setting_id, training_TFIDF, testing_TFIDF):
    ## list of emotions
    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]
    for (ie, gold_emotion) in enumerate(emotions):
        ## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
        docs = list(co_docs.find({'emotion': gold_emotion}))
        if config.verbose:
            print >> sys.stderr, '%d > %s ( %d docs )' % (ie, color.render(gold_emotion, 'g'), len(docs))
        for doc in docs:
            udocID = doc['udocID']
            ldocID = doc['ldocID']
            if ldocID < 800:  # training
                if udocID in training_TFIDF:
                    if keyword_mode:
                        ## use the specified keyword list
                        feature = [(t, training_TFIDF[udocID][t]) for t in training_TFIDF[udocID] if t.lower() in keyword_list]
                    else:
                        ## no keyword list specified
                        feature = dict(training_TFIDF[udocID]).items()
                else:
                    feature = []
            else:  # testing
                if udocID in testing_TFIDF:
                    if keyword_mode:
                        feature = [(t, testing_TFIDF[udocID][t]) for t in testing_TFIDF[udocID] if t.lower() in keyword_list]
                    else:
                        feature = dict(testing_TFIDF[udocID]).items()
                else:
                    feature = []
            mdoc = {
                "emotion": gold_emotion,
                "udocID": udocID,
                "feature": feature,
                "setting": setting_id  # looks like "5369fb11d4388c0aa4c5ca4e"
            }
            co_feature.insert(mdoc)
    co_feature.create_index("setting")
def run(sid):
    c = Counter()
    root = os.path.join('tmp', sid)
    if not os.path.exists(root):
        os.makedirs(root)
    src_paths = {}
    for ftype in ('train', 'test', 'gold'):
        src_fn = '.'.join([sid, ftype, 'txt'])
        src_path = os.path.join('tmp', src_fn)
        if not os.path.exists(src_path):
            print 'missing', src_path, '; run toSVM.py before transforming to binary'
            exit(-1)
        src_paths[ftype] = src_path
    if dest_files_exist(sid, root) and not config.overwrite:
        print 'all destination files already exist'
        exit(0)
    ## load source files
    data = load_src_files(src_paths)
    ## get all labels
    labels = set([x[0] for x in data['train']])
    for anchor in labels:
        # for each gold label, transform to binary
        binary_data = to_binary(data, anchor)
        print 'generating binary data for label', color.render(str(anchor), 'g')
        for ftype in data:
            dest_fn = '.'.join([anchor, 'b', ftype])
            dest_path = os.path.join(root, dest_fn)
            binary_labeled_feature = binary_data[ftype]
            with open(dest_path, 'w') as fw:
                for line_list in binary_labeled_feature:
                    line_str = ' '.join(line_list) + '\n'
                    fw.write(line_str)
def run(sid):
    c = Counter()
    root = os.path.join('tmp', sid)
    if not os.path.exists(root):
        os.makedirs(root)
    src_paths = {}
    for ftype in ('train', 'test', 'gold'):
        src_fn = '.'.join([sid, ftype, 'txt'])
        src_path = os.path.join('tmp', src_fn)
        if not os.path.exists(src_path):
            print 'missing', src_path, '; run toSVM.py before transforming to binary'
            exit(-1)
        src_paths[ftype] = src_path
    if dest_files_exist(sid, root):
        exit(0)
    ## load source files
    data = load_src_files(src_paths)
    ## get all labels
    labels = set([x[0] for x in data['train']])
    for anchor in labels:
        # for each gold label, transform to binary
        binary_data = to_binary(data, anchor)
        print 'generating binary data for label', color.render(str(anchor), 'g')
        for ftype in data:
            dest_fn = '.'.join([anchor, 'b', ftype])
            dest_path = os.path.join(root, dest_fn)
            binary_labeled_feature = binary_data[ftype]
            with open(dest_path, 'w') as fw:
                for line_list in binary_labeled_feature:
                    line_str = ' '.join(line_list) + '\n'
                    fw.write(line_str)
def create_keyword_TFIDF_features(setting_id, training_TFIDF, testing_TFIDF):
    ## list of emotions
    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]
    for (ie, gold_emotion) in enumerate(emotions):
        ## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
        docs = list(co_docs.find({'emotion': gold_emotion}))
        if config.verbose:
            print >> sys.stderr, '%d > %s ( %d docs )' % (ie, color.render(gold_emotion, 'g'), len(docs))
        for doc in docs:
            udocID = doc['udocID']
            ldocID = doc['ldocID']
            if ldocID < 800:  # training
                if udocID in training_TFIDF:
                    # feature = dict(training_TFIDF[udocID]).items()  ## no keyword list specified
                    ## use the specified keyword list
                    feature = [(t, training_TFIDF[udocID][t]) for t in training_TFIDF[udocID] if t.lower() in keyword_list]
                else:
                    feature = []
            else:  # testing
                if udocID in testing_TFIDF:
                    # feature = dict(testing_TFIDF[udocID]).items()
                    feature = [(t, testing_TFIDF[udocID][t]) for t in testing_TFIDF[udocID] if t.lower() in keyword_list]
                else:
                    feature = []
            mdoc = {
                "emotion": gold_emotion,
                "udocID": udocID,
                "feature": feature,
                "setting": setting_id  # looks like "5369fb11d4388c0aa4c5ca4e"
            }
            co_feature.insert(mdoc)
    co_feature.create_index("setting")
def create_keyword_features():
    ## list of emotions
    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]
    for (ie, gold_emotion) in enumerate(emotions):
        ## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
        docs = list(co_docs.find({'emotion': gold_emotion}))
        print >> sys.stderr, '%d > %s ( %d docs )' % (ie, color.render(gold_emotion, 'g'), len(docs))
        for doc in docs:
            mdoc = {
                "emotion": gold_emotion,
                "udocID": doc['udocID'],
                "feature": get_keyword_feature(udocID=doc['udocID']).items(),
                "setting": setting_id  # looks like "5369fb11d4388c0aa4c5ca4e"
            }
            co_feature.insert(mdoc)
    co_feature.create_index("setting")
def extract_and_save(rows, target_postags, target_structures, det_db_cfg, target_word=None, mongodb=True):
    lmtzr = WordNetLemmatizer()
    print 'anchor pos tags:', color.render(', '.join(target_postags), 'lc')
    print 'structures:', color.render(', '.join([x[0] + ':' + str(x[1]) for x in target_structures]), 'lc')
    print '=' * 60
    collect_cnt, skip_cnt = 0, 0
    mc = pymongo.Connection(det_db_cfg['server_addr'])
    db = mc[det_db_cfg['db']]
    co = db[det_db_cfg['collection']]
    sent_cnt, total_word_cnt, anchor_word_cnt, anchor_word_structure_cnt = 0, 0, 0, 0
    for entry in rows:
        ## extract rows
        sid, sent, pos, raw_tree, raw_dep = entry if not mongodb else (entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep'])
        # read dependency and tree objs
        deps = dependency.read(raw_dep, return_type=dict)
        if not deps:
            continue
        tree = Tree(raw_tree)
        # collect certain dependency relations according to pre-specified pos tags
        ## cdeps: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), (u'Yeah', u'JJ', 1), (u'well', u'NN', 2), (u'gotta', u'NN', 6), (u'bowl', u'NN', 11), (u'vinegar', u'NN', 13), (u'put', u'VBN', 9)]
        cdeps = extract_anchors(deps, tree, targets=target_postags)
        ## for stats
        sent_cnt += 1
        total_word_cnt += len(tree.pos())
        anchor_word_cnt += len(cdeps)
        ## ('is', 'VBZ', 8) in [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7) ...]
        for (word, pos, idx) in cdeps:
            ## check if this is the target word, if a target is specified
            if target_word and word.lower() != target_word.lower():
                continue
            ## extract dependency relations which match the target structures
            rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures)
            if rdeps:
                ## got deps matching the target structures
                print color.render('(anchor[v]) ' + word + '-' + str(idx) + ' #' + pos, 'g')
                T = [_transform_to_tuple(dep) for dep in rdeps]
                for (rel, (l, li), (r, ri)) in T:
                    print '  ', color.render(rel, 'r'), color.render('( ' + l + '-' + str(li) + ', ' + r + '-' + str(ri) + ' )', 'y')
                lemma = lmtzr.lemmatize(word, _getWordNetPOS(pos))
                # generate mongo obj
                mongo_obj = {}
                mongo_obj['sid'] = sid      # sentence id
                mongo_obj['word'] = word    # anchor word
                mongo_obj['pos'] = pos      # pos tag of word
                mongo_obj['idx'] = idx      # word index
                mongo_obj['deps'] = rdeps   # related deps
                mongo_obj['lemma'] = lemma  # word lemma
                co.insert(mongo_obj)
                anchor_word_structure_cnt += 1
    mc.close()
    print '=' * 60
    print 'writing statistics log'
    with open('stat.log', 'w') as fw:
        fw.write('total sent' + '\t' + str(sent_cnt) + '\n')
        fw.write('total word' + '\t' + str(total_word_cnt) + '\n')
        fw.write('anchor word' + '\t' + str(anchor_word_cnt) + '\n')
        fw.write('anchor word with structures' + '\t' + str(anchor_word_structure_cnt) + '\n')
    print '=' * 60

if __name__ == '__main__':
    ######## sqlite version ########
    # db_path = 'data/bnc.db3'
    # sql = "select * from BNC_Parsed where sent like ?"
    # args = ['%' + 'interested' + '%']
    # rows = fetch_sqlite(db_path, sql, args)

    ######## mongo version ########
    doraemon = 'doraemon.iis.sinica.edu.tw'
    db_info = {'name': 'BNC', 'collection': 'Parsed'}

    # connect to mongo server
    print >> sys.stderr, color.render('fetching data', 'r'), '...',
    sys.stderr.flush()
    cur = fetch_mongo(doraemon, db_info, None)
    print >> sys.stderr, color.render('done', 'g')

    # get fetched data
    # print >> sys.stderr, color.render('limiting data', 'r'), '...',
    # sys.stderr.flush()
    # rows = cur.limit(1000)
    # print >> sys.stderr, color.render('done', 'g')

    ## pre-specified target pos tags
    target_postags = ['JJ', 'VB', 'NN']

    ## pre-specified structures
    ## 1: necessary
def main(argv, halt=False):
    # default values
    target = 'familiar'
    rule = [('subj', 1), ('cop', 1), ('prep', 1)]
    limit = -1
    dump = False
    var = _extract_opt(argv)
    target = target if not var['target'] else var['target'].strip()
    # note: the rule string from the command line is eval'ed as a Python literal
    rule = rule if not var['rule'] else eval(var['rule'])
    limit = limit if not var['limit'] else int(var['limit'])
    dump = dump if not var['dump'] else var['dump']
    print >> sys.stderr, color.render("target:", 'lc'), target
    print >> sys.stderr, color.render("rule:", 'lc'), rule
    print >> sys.stderr, color.render("limit:", 'lc'), limit
    print >> sys.stderr, color.render("dump:", 'lc'), dump
    if halt:
        print >> sys.stderr, 'press to begin ...', raw_input()

    ## ------------------------------ main program ------------------------------
    R = coDeps.find({'lemma': target}) if limit < 0 else coDeps.find({'lemma': target}).limit(limit)
    for entry in R:
        # get dependency relations
        deps = entry['deps']
        # fetch original sentence info (including the raw tree) to obtain pos tags
        raw = list(coParsed.find({'id': entry['sid']}))[0]
        tree = Tree(raw['tree'])
        # filter deps by the pre-defined rule
        # and yield a dictionary with rel<str> as key, deps<list> as value
        rels = apply_rule(deps, rule)
        if not rels:
            continue
        combs = ListCombination(rels.values())
        # calculate the weight of each combination
        weight = 1 / float(len(combs)) if len(combs) > 1 else 1.0
        # form the anchor element using a (word, index) pair
        anchor = (entry['word'], entry['idx'])
        # collect existing pattern objects, ready to append newly found patterns
        patterns = [] if 'patterns' not in entry else entry['patterns']
        for comb in combs:
            words = form(comb, anchor, tree)
            if not words:
                continue
            pattern = {'rule': rule, 'words': words, 'weight': weight}
            if pattern not in patterns:
                patterns.append(pattern)
            words_str = ' '.join([color.render(x[0], 'g') for x in words])
            print '(%s) %s' % (entry['sid'], words_str)
        ## update mongo document
        if dump:
            save_extracted_patterns(mco=coDeps, sid=entry['sid'], lemma=target, patterns=patterns)
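## Hypothetical invocation sketch: the option names are inferred from the
## keys read out of _extract_opt() above and may not match the real CLI:
# python thisScript.py --target familiar --rule "[('subj',1),('cop',1),('prep',1)]" --limit 100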
config.overwrite = True

## select mongo collections
co_emotions = db[config.co_emotions_name]
co_docs = db[config.co_docs_name]
co_pats = db[config.co_pats_name]
co_lexicon = db[config.co_lexicon_name]
co_patsearch = db[config.co_patsearch_name]

# check that the fetch source exists
config.co_patscore_name = '_'.join([config.co_patscore_prefix] + config.getOpts(fields=config.opt_fields[config.ps_name], full=False))
co_patscore_existed = config.co_patscore_name in db.collection_names()
if not co_patscore_existed:
    print >> sys.stderr, '(error) source collection', color.render(config.co_patscore_name, 'yellow'), 'does not exist'
    print >> sys.stderr, '\tcheck the fetch target and run again!!'
    exit(-1)

# check whether the destination collection already exists
config.co_docscore_name = '_'.join([config.co_docscore_prefix] + config.getOpts(fields=config.opt_fields[config.ds_name], full=False))
co_docscore_existed = config.co_docscore_name in db.collection_names()
if co_docscore_existed and not config.overwrite:
    ## (warning) destination already exists
    print >> sys.stderr, '(warning) destination collection', color.render(config.co_docscore_name, 'red'), 'already exists'
    print >> sys.stderr, '\t use -o or --overwrite to force update'
    exit(-1)
# feature_names = {}  # clear feature_names
for setting_id in setting_ids:
    ### =======================================
    ### check that the fetch collection exists
    ### =======================================
    co_feature_name = 'features.' + settings[setting_id]['feature_name']
    if settings[setting_id]['feature_name'] == 'position':
        co_feature_name = 'features.pattern_emotion_position'
    co_feature_existed = co_feature_name in db.collection_names()
    if co_feature_existed:
        co_features[setting_id] = db[co_feature_name]
    else:
        print >> sys.stderr, '(error) source collection', color.render(co_feature_name, 'yellow'), 'does NOT exist'
        print >> sys.stderr, '\tcheck the fetch target and run again!!'
        exit(-1)

print >> sys.stderr, '[info] fetching -->',
sys.stderr.flush()

fusion_id = get_fusion_id(setting_ids)

### =======================================
### check destination files/folder
### =======================================
new_pathes = check_destination(pathes, token=fusion_id, ext='txt')
if not new_pathes:
    print >> sys.stderr, fusion_id, 'next'
    continue
    elif opt in ('-l', '--limit'):
        config.min_count = int(arg.strip())
    elif opt in ('-v', '--verbose'):
        config.verbose = True
    elif opt in ('-o', '--overwrite'):
        config.overwrite = True

## fetch from collection
config.co_docscore_name = '_'.join([config.co_docscore_prefix] + config.getOpts(fields=config.opt_fields[config.ev_name], full=False))

# bail out if the fetch target collection cannot be found
co_docscore_existed = config.co_docscore_name in db.collection_names()
if not co_docscore_existed:
    print >> sys.stderr, '(error) collection', color.render(config.co_docscore_name, 'yellow'), 'does not exist'
    print >> sys.stderr, '\tcheck the fetch target and run again!!'
    exit(-1)

# check whether the results already exist
cfg = ','.join(config.getOpts(fields=config.opt_fields[config.ev_name], key_value='=', full=True))
mdoc_results_existed = True if db[config.co_results_name].find_one({'cfg': cfg}) else False
skip_eval = False if not mdoc_results_existed or config.overwrite else True

co_docscore = db[config.co_docscore_name]
co_results = db[config.co_results_name]
def get_keyword_feature(udocID):
    keywordFeature = Counter()
    sents = {x['usentID']: x['sent_length'] for x in co_sents.find({'udocID': udocID})}
    total_words = sum([sents[x] for x in sents])
    th1 = total_words * config.begPercentage / float(100)
    th2 = total_words * (config.begPercentage + config.midPercentage) / float(100)
    ## find all words in the document <udocID>
    words = []
    POSs = []
    wordIDs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:
        ## words: list of 'happy'
        words.extend(sent_mdoc['sent'].split(' '))
        ## POSs: list of 'happy/JJ'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))
        ## wordIDs: list of 'word id'
        wordID_offset = 0
        for key in sents:
            if key < sent_mdoc['usentID']:
                wordID_offset += sents[key]
        wordIDs.extend([(x + 1 + wordID_offset) for x in range(sents[sent_mdoc['usentID']])])
    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render('#' + str(udocID), 'y'), len(words))
    for idx, word in enumerate(words):
        word = word.lower()
        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('N'):
                pos = 'n'
            elif POS.startswith('V'):
                pos = 'v'
            elif POS.startswith('J'):
                pos = 'a'
            elif POS.startswith('R'):
                pos = 'r'
            else:
                pos = None
            if pos:  # only lemmatize certain pos types
                word = lmtzr.lemmatize(word, pos)
        if wordIDs[idx] <= th1:
            position = 'beginning'
        elif wordIDs[idx] <= th2:
            position = 'middle'
        else:
            position = 'end'
        count = get_keyword_count(word)
        if not count:
            continue  # skip words without counts (the original `return {}` here discarded the whole document)
        count = remove_self_count(udocID, word, count)
        percentage = config.cutoffPercentage / float(100)
        binary_vector = accumulate_threshold(count, percentage)
        if config.featureValueType == 'b':
            for emo in binary_vector:
                key = '@' + position + '_' + emo
                keywordFeature[key] += binary_vector[emo]
        ## pattern count (frequency)
        elif config.featureValueType == 'f':
            count_vector = {e: count[e] for e in binary_vector if binary_vector[e] == 1}
            for emo in count_vector:
                key = '@' + position + '_' + emo
                keywordFeature[key] += count_vector[emo]
        ## keyword score
        elif config.featureValueType == 's':
            keyword_score = scoring(count)
            score_vector = {e: keyword_score[e] for e in binary_vector if binary_vector[e] == 1}
            for emo in score_vector:
                key = '@' + position + '_' + emo
                keywordFeature[key] += score_vector[emo]
        else:
            return False
    return keywordFeature
def get_keyword_feature(udocID):
    ## find all words in the document <udocID>
    words = []
    POSs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:
        ## words: list of 'happy'
        words.extend(sent_mdoc['sent'].split(' '))
        ## POSs: list of 'happy/JJ'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))
    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render('#' + str(udocID), 'y'), len(words))
    ## create keyword features
    keywordFeature = Counter()
    for idx, word in enumerate(words):
        word = word.lower()
        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('N'):
                pos = 'n'
            elif POS.startswith('V'):
                pos = 'v'
            elif POS.startswith('J'):
                pos = 'a'
            elif POS.startswith('R'):
                pos = 'r'
            else:
                pos = None
            if pos:  # only lemmatize certain pos types
                word = lmtzr.lemmatize(word, pos)
        count = get_keyword_count(word)
        if not count:
            if config.debug:
                print 'no count of', word, ', continue to next word.'
            continue  # if no count, skip this word
        count = remove_self_count(udocID, word, count)
        percentage = config.cutoffPercentage / float(100)
        binary_vector = accumulate_threshold(count, percentage)
        if config.featureValueType == 'b':
            for emo in binary_vector:
                keywordFeature[emo] += binary_vector[emo]
        ## pattern count (frequency)
        elif config.featureValueType == 'f':
            count_vector = {e: count[e] for e in binary_vector if binary_vector[e] == 1}
            for emo in count_vector:
                keywordFeature[emo] += count_vector[emo]
        ## keyword score
        elif config.featureValueType == 's':
            keyword_score = scoring(count)
            score_vector = {e: keyword_score[e] for e in binary_vector if binary_vector[e] == 1}
            for emo in score_vector:
                keywordFeature[emo] += score_vector[emo]
        else:
            return False  # unknown featureValueType
    return keywordFeature
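## The three featureValueType modes above, on an illustrative count vector
## count = {'happy': 9, 'sad': 1} where the cutoff keeps only 'happy':
##   'b' (binary)    -> {'happy': 1}
##   'f' (frequency) -> {'happy': 9}
##   's' (score)     -> {'happy': scoring(count)['happy']}
## (the kept set comes from accumulate_threshold; semantics assumed here)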
## select collections
co_svm_eval = db[config.co_svm_eval_name]
co_svm_out = db[config.co_svm_out_name]
co_svm_gold = db[config.co_svm_gold_name]

## generate the to-do list
if update_all:
    to_do_list = find_availale_experiments()
else:
    ## check setting id
    if not setting_id:
        print >> sys.stderr, '[error] specify a setting id'
        exit(-1)
    else:
        setting_ids = [setting_id]
        to_do_list = [(setting_id, param)]

for (setting_id, param) in to_do_list:
    print >> sys.stderr, '[run] processing', color.render(setting_id, 'g'), color.render(param, 'y')
    eval_mdoc = run(setting_id, param)
    if intersection:
        find_intersection(eval_mdoc)
    if config.verbose:
        pprint(eval_mdoc)
        size = os.stat(abs_path).st_size
        if fn.endswith('.m'):
            fns[sid]['model'] = size
        elif fn.endswith('.gold.txt'):
            fns[sid]['gold'] = size
        elif fn.endswith('.train.txt'):
            fns[sid]['train'] = size
        elif fn.endswith('.test.txt'):
            fns[sid]['test'] = size
        elif fn.endswith('.out'):
            fns[sid]['out'] = size
        else:
            continue

for sid in fns:
    if 0 in fns[sid].values():
        status = 'need to check'
    elif len(fns[sid]) == 3:
        status = color.render('3/5', 'r')
    elif len(fns[sid]) == 4:
        status = color.render('4/5', 'y')
    elif len(fns[sid]) == 5:
        status = color.render('all done', 'g')
    else:
        status = 'need to check'  # fewer than 3 files; `status` was unbound here before
    print sid, '(', status, ')'
    for ftype in fns[sid]:
        print '\t', ftype, '\t', fns[sid][ftype]
        ## extract all sentences in one document
        sents = extract_sents(doc)
        for sent in sents:
            ## for each sentence, extract patterns
            pats = extract_pattern(sent, targets, rule)
            ## display results
            if config.verbose:
                sent_str = ' '.join([k[0] for k in sorted(set(reduce(lambda x, y: x + y,
                    [((d['x'], d['xIdx']), (d['y'], d['yIdx'])) for d in sent])), key=lambda a: a[1])][1:])
                print '> %s (%s)' % (sent_str, color.render(str(len(pats)), 'lc'))
                for p in pats:
                    pat_str = ' '.join([x[0] for x in p['pat']])
                    print '  ' + color.render(pat_str.lower(), 'g'), round(p['weight'], 2)
            ## store back in mongo
            store_mongo(sent, pats, co_pats, topic_or_emotion)
        print '> %s / %s' % (udocID, MaxudocID)
        if config.verbose:
            print '%s end of document %d %s' % ('=' * 20, udocID, '=' * 20)