def show_confirm(workflow):
	for cmd in workflow:
		print >> sys.stderr, color.render('[cmd]', 'lc'), ' '.join(cmd)
	print >> sys.stderr, color.render('> correct? [Y/n] ', 'g'),
	ok = raw_input()
	print >> sys.stderr
	return ok
def calculate_pattern_scores():
	total = co_lexicon.count()

	for i,mdoc in enumerate(co_lexicon.find()):
		
		pattern = mdoc['pattern']

		percent = (i+1)/float(total)*100

		if not config.verbose:
			sys.stderr.write('[%s> %.2f%%%s]\r' % ('='*(int(percent)+1), percent, ' '*(100-int(percent) ) ) )
			sys.stderr.flush()
		else:
			logging.debug('[%.2f%%] (%d/%d) process %s' % (percent, i+1, total, color.render(pattern, 'ly') ))

		count = get_patcount(pattern)
		logging.debug('get count of "%s" (%d)' % (color.render(pattern,'g'), len(count) ))

		
		pattern_score = {} if not count else feature.pattern_scoring_function(count)

		mdoc = {
			'score':pattern_score,
			'pattern':pattern
		}
		logging.debug('insert mdoc in %s' % (color.render(co_patscore.full_name, 'ly') ) )
		co_patscore.insert(mdoc)

	sys.stderr.write('\n')
	logging.info('create index on %s in %s' % (color.render('pattern', 'g'), color.render(co_patscore.full_name, 'ly') ) )
	co_patscore.create_index("pattern")
def execute_workflow(workflow):
	for cmd in workflow:
		print >> sys.stderr, color.render('['+cmd[0].split('/')[-1]+']', 'r')
		print >> sys.stderr, color.render(' start '+'>'*10, 'y' )
		retcode = subprocess.call(cmd, shell=False)
		print >> sys.stderr, color.render( ' end '+'<'*10, 'b')	
	return True
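## Minimal usage sketch (not part of the original source): build a
## workflow, confirm it on stderr, then run it; the script names are
## hypothetical placeholders.
def demo_confirm_and_run():
	workflow = [
		['python', 'extractPatterns.py', '-v'],
		['python', 'scorePatterns.py'],
	]
	ok = show_confirm(workflow)
	if ok.strip().lower() in ('', 'y', 'yes'):  # empty input defaults to yes
		execute_workflow(workflow)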
Example #5
def show_most_common(posU, posP, threshold=1.0, max_usage_cnt=10, min_cnt=10, min_percent=0.01):
	for anchor in posP:
		print '='*10,anchor,'='*10

		itemP = sorted(posP[anchor].items(), key=lambda x:x[1], reverse=True)
		accum = 0.0
		for (i, (usage, portion)) in enumerate(itemP):
			if i == max_usage_cnt: break
			if portion < min_percent and posU[anchor][usage] < min_cnt: break
			accum += portion

			colorful = []
			for x in usage.split():
				if x == 'PERSON':
					colorful.append(color.render('PERSON', 'g'))
				elif x == 'SOMETHING':
					colorful.append(color.render('SOMETHING', 'r'))
				elif re.match(r'#[^\.]+\.[a-z]', x): # "#familiar.j"
					colorful.append(color.render(x, 'lc'))
				else:
					colorful.append(x)
			colorful_usage = ' '.join(colorful)

			print colorful_usage,'\t', posU[anchor][usage],'\t', round(posP[anchor][usage]*100.0, 4), '%'
			if accum >= threshold:
				break
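## Toy usage sketch with invented numbers: usage counts in posU and
## portions in posP for a single anchor word 'get'.
posU_demo = {'get': {'PERSON get SOMETHING': 120, 'get #angry.j': 30}}
posP_demo = {'get': {'PERSON get SOMETHING': 0.8, 'get #angry.j': 0.2}}
show_most_common(posU_demo, posP_demo, threshold=0.95)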
def run(targets_rules):

	udocIDs = co_deps.distinct('udocID')

	MaxudocID = max(udocIDs)
	

	for targets, rule in targets_rules:
		
		rule_str = color.render(' '.join([str(x)+','+str(y) for x,y in rule]), 'lightblue')

		for udocID in udocIDs:

			logging.info(' process %d/%d; rule: %s' % (udocID, MaxudocID, rule_str))

			doc = list(co_deps.find( {'udocID':udocID} ))

			## extract all sentences in one document
			sents = extract_sents(doc)

			for sent in sents:

				## extract patterns in each sentence
				pats = extract_pattern(sent, targets, rule)

				## display results
				if config.verbose:
					sent_str = ' '.join([k[0] for k in sorted(set(reduce(lambda x,y:x+y, [((d['x'],d['xIdx']), (d['y'],d['yIdx'])) for d in sent])), key=lambda a:a[1])][1:] )
					logging.debug('%s (%d)' % (sent_str, len(pats)))
					for p in pats:
						pat_str = ' '.join([x[0] for x in p['pat']])
						logging.debug('   %s %.1f' % (color.render(pat_str.lower(), 'g'), p['weight']))

				## store back in mongo
				for p in pats:
					mdoc = {
						'sent_length': 		sent[0]['sent_length'],
						'udocID': 			sent[0]['udocID'],
						'usentID': 			sent[0]['usentID'],
						config.category: 	sent[0][config.category],

						'pattern_length': 	len(p['pat']),
						'pattern': 			' '.join([x[0] for x in p['pat']]),
						'rule': 			p['matched_rule'],

						'anchor': 			p['anchor'][0],
						'anchor_type': 		p['anchor'][1],
						'anchor_idx': 		p['anchor'][2],

						'weight': 			p['weight']
					}
					co_pats.insert(mdoc)
def setting_prompt():
	for i, setting in enumerate(db[config.co_feature_setting_name].find()):
		if setting['feature_name'] == 'fusion':
			continue
		sid = str(setting['_id'])
		sids_map[i] = sid
		settings[sid] = setting
		sname = setting['feature_name']
		print >> sys.stderr, i,'>', color.render( sid, 'yellow' )
		for x in setting:
			if x == '_id': continue
			print >> sys.stderr, '\t' ,x, ':', color.render( str(setting[x]), 'g' )
		print >> sys.stderr
def check_indexes(check_list, verbose=True):
	res = []
	for co, idx_name in check_list:
		INDEXED = False
		current_idx_full_names = co.index_information().keys()
		for current_idx_full_name in current_idx_full_names:
			current_idx = '_'.join(current_idx_full_name.split('_')[:-1]) 
			if current_idx == idx_name:
				INDEXED = True
				break
		if verbose: logging.info('collection: %s, index: %s (%s)' % (color.render(co.full_name, 'y'), color.render(idx_name,'g'), 'o' if INDEXED else 'x') )
		if not INDEXED:
			co.create_index(idx_name)
			if verbose: logging.warn('create index on %s in %s' % (color.render(idx_name, 'g'), color.render(co.full_name, 'y') ))
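## Usage sketch (assumes co_pats and co_patscore are collection handles
## opened elsewhere in this module): make sure the single-key indexes we
## query on exist before processing.
check_indexes([(co_pats, 'udocID'), (co_patscore, 'pattern')])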
def generate_feature_vectors(src_setting_ids):

	global co_feature_setting, eids, udocID_eid

	feature_pool = {}
	feature_vectors = {}

	# for each src_setting_id
	for src_setting_id in src_setting_ids:
		
		# find feature_name --> collection_name
		try:
			feature_name = co_feature_setting.find_one( {'_id': ObjectId(src_setting_id) } )['feature_name']
		except Exception:
			print 'check the format of the feature setting:', src_setting_id, 'in mongodb'
			return False


		collection_name = 'features.' + feature_name

		## use src_setting_id as prefix
		prefix = src_setting_id

		## gathering
		number = db[collection_name].find({'setting':src_setting_id}).count()
		if number == 0: 
			logging.error("can't find any instances with id "+ color.render(src_setting_id, 'y') + ' in ' + color.render(collection_name,'g'))
			return False

		for mdoc in db[collection_name].find({'setting':src_setting_id}):
			udocID = mdoc['udocID']
			emotion = mdoc['emotion']
			
			## use emotion index as eid
			eid = eids[emotion]

			if eid not in feature_vectors:
				feature_vectors[eid] = defaultdict(list)

			## save the mapping of udocID -> eid
			# udocID_gid[udocID] = eid
			feature = mdoc['feature']

			if not feature:
				feature_vectors[eid][udocID] = []
			else:
				for f_name, f_value in feature:

					# combine f_name with prefix
					f_name = '#'.join([prefix, f_name])

					# generate fid
					if f_name not in feature_pool:
						feature_pool[f_name] = len(feature_pool)
					# get fid
					fid = feature_pool[f_name]

					feature_vectors[eid][udocID].append( (fid, f_value) )

	return feature_vectors
Example #11
def get_pattern_feature(udocID):

	sents = { x['usentID']:x['sent_length'] for x in list( co_sents.find( {'udocID': udocID} ) ) }
	usentID_offset = min(sents)
	total_words = sum([sents[x] for x in sents])

	th1 = total_words * config.begPercentage/float(100)
	th2 = total_words * (config.begPercentage+config.midPercentage)/float(100)

	patFeature = Counter()

	## find all pats in the document <udocID>
	pats = list( co_pats.find( {'udocID': udocID} ) )

	if config.verbose:
		print >> sys.stderr, '\t%s (%d pats)\t' % (  color.render('#' + str(udocID), 'y'), len(pats))

	for pat in pats:

		if get_total_count(pat['pattern']) >= config.min_count:

			## find pattern position ( beginning/middle/end )
			lanchorID = sum([sents[usentID_offset+i] for i in range(pat['usentID'] - usentID_offset)]) + pat['anchor_idx']
			if lanchorID <= th1: position = 'beginning'
			elif lanchorID <= th2: position = 'middle'
			else: position = 'end'

			key = '@'+ position + '_' + pat['pattern']
			patFeature[ key ] += 1

	return patFeature
def create_keyword_features():

    ## list of emotions
    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]

    for (ie, gold_emotion) in enumerate(emotions):

        ## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
        docs = list(co_docs.find({'emotion': gold_emotion}))

        print >> sys.stderr, '%d > %s ( %d docs )' % (
            ie, color.render(gold_emotion, 'g'), len(docs))

        for doc in docs:
            udocID = doc['udocID']
            kw_feature = get_keyword_feature(udocID).items()
            mdoc = {
                "emotion": gold_emotion,
                "udocID": doc['udocID'],
                "feature": kw_feature,
                "setting": setting_id  # looks like "5369fb11d4388c0aa4c5ca4e"
            }
            co_feature.insert(mdoc)

    co_feature.create_index("setting")
def get_keyword_feature(udocID):

    keywordFeature = Counter()

    ## find all words in the document <udocID>
    words = []
    POSs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:
        words.extend(sent_mdoc['sent'].split(' '))  # words: list of 'happy'
        POSs.extend(
            sent_mdoc['sent_pos'].split(' '))  # POSs: list of 'happy/JJ'

    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render(
            '#' + str(udocID), 'y'), len(words))

    for idx, word in enumerate(words):
        word = word.lower()

        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('J'): pos = 'a'
            elif POS.startswith('V'): pos = 'v'
            elif POS.startswith('R'): pos = 'r'
            else: pos = 'n'
            word = lmtzr.lemmatize(word, pos)

        if word in keyword_list:
            keywordFeature[word] += 1

    return keywordFeature
def get_document_feature(udocID):

	sents = { x['usentID']:x['sent_length'] for x in list( co_sents.find( {'udocID': udocID} ) ) }
	usentID_offset = min(sents)
	total_words = sum([sents[x] for x in sents])

	th1 = total_words * config.begPercentage/float(100)
	th2 = total_words * (config.begPercentage+config.midPercentage)/float(100)

	# print sents, '\ntotal_words = ', total_words, '\nusentID_offset = ', usentID_offset, '\nth1 = ', th1, '\nth2 = ', th2

	docfeature = Counter()

	## find all pats in the document <udocID>
	pats = list( co_pats.find( {'udocID': udocID} ) )

	if config.verbose:
		print >> sys.stderr, '\t%s (%d pats)\t' % (  color.render('#' + str(udocID), 'y'), len(pats))

	for pat in pats:
		## find pattern position ( beginning/middle/end )
		lanchorID = sum([sents[usentID_offset+i] for i in range(pat['usentID'] - usentID_offset)]) + pat['anchor_idx']
		if lanchorID <= th1: position = 'beginning'
		elif lanchorID <= th2: position = 'middle'
		else: position = 'end'
		# print '='*30, '\n', pat['pattern'], '\n', 'lanchorID = ', lanchorID, '\n', 'position = ', position

		patfeature = get_patfeature(udocID, position, pat['pattern'])


		for e in patfeature: 
			key = '#position'+ '@'+ position + '_' + e
			docfeature[key] += patfeature[e]

	return docfeature
def document_scoring(udocID):
    # find all pats in the document <udocID>
    pats = list(co_pats.find({'udocID': udocID}))

    global search_list

    if config.verbose:
        print >> sys.stderr, '\t%s (%d pats)\t' % (color.render(
            '#' + str(udocID), 'y'), len(pats))

    D = defaultdict(list)

    # calculate the event score in each pattern
    for pat in pats:

        # ignore patterns with occurrence less than x
        # use -l x or --limit x to specify
        if search_list:
            if pat['pattern'] not in search_list:
                continue

        EventScores = event_scoring(pat)
        for emotion in EventScores:
            D[emotion].append(EventScores[emotion])

    scores = dict([(e, sum(D[e]) / float(len(D[e]))) for e in D])

    return scores
def check_indexes(target, indexes, auto=True):
    # |-- idx_name --| |---------   idx_value  --------------|
    # [(u'_id_',       {u'key': [(u'_id', 1)], u'v': 1}),
    #  (u'pattern_1',  {u'key': [(u'pattern', 1.0)], u'v': 1})]
    existed = set()
    for (idx_name, idx_value) in target.index_information().items():
        for idx_str, idx_n in idx_value['key']:
            idx_n = int(idx_n)
            if idx_str in indexes: existed.add(idx_str)

    ## indexes are not all present
    if len(existed) < len(set(indexes)):
        to_be_created = [x for x in indexes if x not in existed]
        # index_to_be_created = color.render( ', '.join([x for x in to_be_created]), 'green')
        to_be_created_str = color.render(', '.join(to_be_created), 'g')

        if auto:
            print >> sys.stderr, '(warning) missing necessary index(es)', to_be_created_str

            for idx_str in to_be_created:
                print >> sys.stderr, 'automatically creating index', idx_str, '...',
                sys.stderr.flush()
                ## create index in target collection
                target.create_index(idx_str)
                print >> sys.stderr, 'done'

        else:
            print >> sys.stderr, '(error) please manually create index(es)', to_be_created_str, 'first before calculating the score'
            return False

    return True
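## Usage sketch (not from the original source): verify the indexes
## needed for scoring and bail out when auto-creation is disabled;
## assumes co_pats is an open collection handle.
if not check_indexes(co_pats, ['udocID', 'pattern'], auto=False):
    exit(-1)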
def check_destination(pathes, token, ext='txt'):

	if not os.path.exists(pathes['_root_']):
		os.mkdir(pathes['_root_'])

	new_pathes = {}

	for ftype in pathes:

		if ftype.startswith('_') and ftype.endswith('_'):
			continue

		fn = pathes[ftype]
		## auto-generated filename
		fn = fn if fn else '.'.join([token, ftype, ext])

		## check if destination path already exists
		# join root to yield destination path
		dest_path = os.path.join(pathes['_root_'], fn)
		new_pathes[ftype] = dest_path


		## destination already exists
		if os.path.exists(dest_path) and not config.overwrite:
			if not fusion_all:
				print >> sys.stderr, '[error] destination file', color.render(dest_path, 'red'), 'already exists'
				print >> sys.stderr, '        use -o or --overwrite to force overwrite'
				exit(-1)
			else:
				return False

	return new_pathes
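## Usage sketch with hypothetical values: '_root_' is the output folder,
## and entries left as None get auto-generated names like '<token>.train.txt'.
pathes_demo = {'_root_': 'tmp', 'train': None, 'test': None, 'gold': None}
new_pathes_demo = check_destination(pathes_demo, token='5369fb11d4388c0aa4c5ca4e')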
Example #21
def show(collected):
	for anchor in collected:
		print '='*10, anchor, '='*10
		for (i, (usage, usage_cnt, usage_portion)) in enumerate(collected[anchor]):
			colorful = []
			for x in usage.split():
				if x == 'PERSON':
					colorful.append(color.render('PERSON', 'g'))
				elif x == 'SOMETHING':
					colorful.append(color.render('SOMETHING', 'r'))
				elif re.match(r'#[^\.]+\.[a-z]', x): # "#familiar.j"
					colorful.append(color.render(x, 'lc'))
				else:
					colorful.append(x)
			colorful_usage = ' '.join(colorful)

			print colorful_usage,'\t', usage_cnt,'\t', round(usage_portion*100.0, 4), '%'
def find_intersection(eval_mdoc):

	LJ40K = sorted([x['emotion'] for x in db['emotions'].find({'label':'LJ40K'}) ])
	Mishne05 = sorted([x['emotion'] for x in db['emotions'].find({'label':'Mishne05'}) ])

	inter = []
	for e in set(LJ40K+Mishne05):
		if e in LJ40K and e in Mishne05: inter.append(e)
		
	inter_accuracy = {}
	for e in eval_mdoc['accuracy']:
		if e in inter:
			inter_accuracy[e] = eval_mdoc['accuracy'][e]

	pprint(inter_accuracy)
	print 'avg accuracy in overall\t\t', color.render( str(eval_mdoc['avg_accuracy']), 'g')
	print 'avg accuracy in intersection\t', color.render( str(round( sum(inter_accuracy.values())/float(len(inter_accuracy.values())), 4)), 'y')
def get_keyword_feature(udocID):

    sents = {
        x['usentID']: x['sent_length']
        for x in list(co_sents.find({'udocID': udocID}))
    }
    total_words = sum([sents[x] for x in sents])

    th1 = total_words * config.begPercentage / float(100)
    th2 = total_words * (config.begPercentage +
                         config.midPercentage) / float(100)

    keywordFeature = Counter()

    ## find all words in the document <udocID>
    words = []
    POSs = []
    wordIDs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:

        ## words: list of 'happy'
        words.extend(sent_mdoc['sent'].split(' '))

        ## POSs: list of 'happy/JJ'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))

        ## wordIDs: list of 'word id'
        wordID_offset = 0
        for key in sents:
            if key < sent_mdoc['usentID']: wordID_offset += sents[key]
        wordIDs.extend([(x + 1 + wordID_offset)
                        for x in range(sents[sent_mdoc['usentID']])])

    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render(
            '#' + str(udocID), 'y'), len(words))

    for idx, word in enumerate(words):
        word = word.lower()

        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('J'): pos = 'a'
            elif POS.startswith('V'): pos = 'v'
            elif POS.startswith('R'): pos = 'r'
            else: pos = 'n'
            word = lmtzr.lemmatize(word, pos)

        if wordIDs[idx] <= th1: position = 'beginning'
        elif wordIDs[idx] <= th2: position = 'middle'
        else: position = 'end'

        if word in keyword_list:
            key = '@' + position + '_' + word
            keywordFeature[key] += 1

    return keywordFeature
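## Worked example of the position bucketing above (sample numbers, not
## from the source): with total_words = 200, begPercentage = 20 and
## midPercentage = 50, th1 = 200*0.2 = 40.0 and th2 = 200*0.7 = 140.0,
## so wordID 25 -> 'beginning', 100 -> 'middle', 150 -> 'end'.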
Example #24
def find_intersection(eval_mdoc):

	LJ40K = sorted([x['emotion'] for x in db['emotions'].find({'label':'LJ40K'}) ])
	Mishne05 = sorted([x['emotion'] for x in db['emotions'].find({'label':'Mishne05'}) ])

	inter = []
	for e in set(LJ40K+Mishne05):
		if e in LJ40K and e in Mishne05: inter.append(e)
		
	inter_accuracy = {}
	for e in eval_mdoc['accuracy']:
		if e in inter:
			inter_accuracy[e] = eval_mdoc['accuracy'][e]

	pprint(eval_mdoc['accuracy'])

	print >> sys.stderr, 'avg accuracy in overall\t\t', color.render( str(eval_mdoc['avg_accuracy']), 'g')
	print >> sys.stderr, 'avg accuracy in intersection\t', color.render( str(round( sum(inter_accuracy.values())/float(len(inter_accuracy.values())), 4)), 'y')
def update_all_document_scores():

    global search_list

    search_list = get_search_list()

    emotions = [x['emotion'] for x in co_emotions.find({'label': 'LJ40K'})]

    ## drop docscore collection if overwrite is enabled
    if config.overwrite:
        print >> sys.stderr, 'drop collection', config.co_docscore_name
        co_docscore.drop()

    for (ie, gold_emotion) in enumerate(emotions):

        ## get all documents with emotion <gold_emotion> whose ldocID is at least 800
        docs = list(
            co_docs.find({
                'emotion': gold_emotion,
                'ldocID': {
                    '$gte': 800
                }
            }))

        if config.verbose:
            print >> sys.stderr, '%d > %s ( %d docs )' % (
                ie, color.render(gold_emotion, 'g'), len(docs))

        else:
            print >> sys.stderr, '%d > %s' % (ie,
                                              color.render(gold_emotion, 'g'))

        for doc in docs:

            # score a document in 40 diff emotions
            scores = document_scoring(doc['udocID'])

            mdoc = {
                'udocID': doc['udocID'],
                'gold_emotion': gold_emotion,
                'scores': scores
            }
            co_docscore.insert(mdoc)
def choose():
	while True:
		print >> sys.stderr,'> choose setting ID(s) [ 0 ~ '+str(len(sids_map)-1)+' ]: ',
		str_sids = raw_input()
		try:
			setting_ids = map(lambda x: sids_map[int(x)], re.findall(r'([0-9]+)\s*,?', str_sids))
			break
		except:
			print >> sys.stderr, color.render( '\n[error] enter a valid format, e.g., "0,1,2" or "1", with valid index numbers\n', 'r' )
	print >> sys.stderr
	return setting_ids
def calculate_pattern_scores_remove_self(category):

	## list of category
	categories = [ x[category] for x in co_cate.find( { 'label': category } ) ]
	logging.debug('found %d categories' % len(categories))

	for (ie, gold_category) in enumerate(categories):

		## get all documents with category <gold_category> (ldocID: 0-799 for training, 800-999 for testing)
		docs = list( co_docs.find( { category: gold_category } ) )
		logging.info('%d/%d %s: %d docs' % ( ie, len(categories), color.render(gold_category, 'lg'), len(docs) ))

		for ith_doc, doc in enumerate(docs):
			udocID = doc['udocID']
			## find all pats in the document <udocID>
			pats = list( co_pats.find( {'udocID': udocID} ) )
			logging.info('%s --> %s (%d pats) [%d/%d]\t%.1f%%' % ( color.render(gold_category, 'lg'), color.render(str(udocID),'ly'), len(pats), ith_doc+1, len(docs), (ith_doc+1)/float(len(docs))*100 )  )

			for pat in pats:

				pattern_score = {}
				pattern = pat['pattern']

				count = get_patcount(pattern)
				logging.debug('get count of "%s" (%d)' % (color.render(pattern,'g'), len(count) ))

				if count:

					count = remove_self_count(udocID, pattern, count, category=config.category)
					logging.debug('remove self count of "%s" in udocID: %s' % (color.render(pattern,'g'), color.render(str(udocID),'lc')) )

					pattern_score = feature.pattern_scoring_function(count)

				mdoc = {
					'score':pattern_score,
					'udocID':udocID,
					'pattern':pattern
				}
				co_patscore.insert(mdoc)

	co_patscore.create_index("pattern")
def load_data():
	global answers, golds, labels

	paths = search_files()

	if paths:
		print >> sys.stderr, '[path] [confusion_matrix.py] path for out:', paths['out']
		print >> sys.stderr, '[path] [confusion_matrix.py] path for gold:',paths['gold']		
		answers = [line.strip().split('\t')[0] for line in open(paths['out'])]
		golds = [line.strip().split('\t')[0] for line in open(paths['gold'])]
		labels = { line.strip().split('\t')[0]:line.strip().split('\t')[-1] for line in open(paths['gold']) }		
	else:
		print >> sys.stderr, color.render('[error] [confusion_matrix.py] cannot find the files.', 'r')
def run():
	global co_feature_setting
	# collection pointer of feature settings
	co_feature_setting = db[config.co_feature_setting_name]

	# sorted src_setting_id
	src_setting_ids = parse_src_setting_ids()
	
	dest_setting_id = obtain_dest_setting_id(src_setting_ids)

	dest_paths = get_dest_paths(dest_setting_id)

	## logging
	logging.debug('src_setting_ids: '+color.render(','.join(src_setting_ids), 'y') )
	for ftype, fn in sorted(dest_paths.items()):
		logging.debug( ftype+': '+color.render(fn, 'g') )
	
	logging.info('dest_setting_id: '+color.render(dest_setting_id, 'y') )


	# all destination files already exist
	if is_dest_files_exist(dest_paths) and not config.overwrite:
		logging.info('all files already exist')
	# otherwise, (re)generate the feature files
	else:
		logging.info('generate feature vectors')
		feature_vectors = generate_feature_vectors(src_setting_ids)

		if not feature_vectors:	exit(-1)

		logging.info('transform to svm format')
		str_feature_vectors = tranform_to_svm_format(feature_vectors)

		logging.info('generate train/test files')
		generate_train_test_files(str_feature_vectors, dest_paths)
	
	return True
Example #31
def extract(rows, target_postags, target_structures, target_word=None, mongodb=True, VERBOSE=True):



	print 'anchor pos tags:', color.render(', '.join(target_postags), 'lc')
	print 'structures:', color.render(', '.join([x[0]+':'+str(x[1]) for x in target_structures]), 'lc')
	print '='*60
	collect_cnt, skip_cnt = 0, 0
	total_word_cnt, anchor_word_cnt = 0, 0  # counters updated below

	for entry in rows:

		## extract rows
		sid, sent, pos, raw_tree, raw_dep = entry if not mongodb else (entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep'])
		
		# read dependency and tree objs
		deps = dependency.read(raw_dep, return_type=dict)
		if not deps: continue
		tree = Tree(raw_tree)

		# collect certain dependency relations according to pre-specified pos tags
		## cdeps: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), (u'Yeah', u'JJ', 1), (u'well', u'NN', 2), (u'gotta', u'NN', 6), (u'bowl', u'NN', 11), (u'vinegar', u'NN', 13), (u'put', u'VBN', 9)]
		cdeps = extract_anchors(deps, tree, targets=target_postags)

		total_word_cnt += len(tree.pos())
		anchor_word_cnt += len(cdeps)

		##  ('is', 'VBZ', 8) in [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7) ...]
		for (word, pos, idx) in cdeps:

			## check if this is the target word if a target specified
			if target_word and word.lower() != target_word.lower():
				if VERBOSE:
					print color.render('(anchor[x]) '+word+'-'+str(idx)+' #'+pos, 'b')
				continue

			## extract dependency relations which match the target structures 
			rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures)

			if rdeps: ## got deps match the target structures

				if VERBOSE:
					print color.render('(anchor[v]) '+word+'-'+str(idx)+' #'+pos, 'g')

				T = [ _transform_to_tuple(dep) for dep in rdeps]
				for (rel, (l, li), (r, ri)) in T: print '  ',color.render(rel,'r'),color.render('( '+l+'-'+str(li)+', '+r+'-'+str(ri)+' )','y')

	print '='*60
def print_confirm(confirm_msg, bar=40, halt=True):
	for msg in confirm_msg:
		msg = list(msg)
		if len(msg) > 1:
			for i in range(len(msg)-1):
				if type(msg[i+1]) == bool:
					msg[i+1] = color.render(str(msg[i+1]), color_for[bool][msg[i+1]])

		if len(msg) == 3 and type(msg[2]) == dict:
			print >> sys.stderr, msg[0], ':', msg[1], msg[2][msg[1]]
		elif len(msg) == 3 and type(msg[2]) == str:
			print >> sys.stderr, msg[0], ':', msg[1], msg[2]
		elif len(msg) == 2:
			print >> sys.stderr, msg[0], ':', msg[1]
		else:
			print >> sys.stderr, msg

	print >> sys.stderr, '='*bar

	if halt:
		print >> sys.stderr, 'press any key to start...', raw_input()	
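## Usage sketch (not in the original source) showing the three message
## shapes print_confirm handles: (label, value), (label, key, dict) and
## (label, value, extra-str); boolean values get colored via color_for.
print_confirm([
	('overwrite', config.overwrite),
	('collection', 'pats', {'pats': 'pats_collection'}),
	('mode', 'dry-run', 'enabled'),
], halt=False)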
def get_pattern_feature(udocID):

	patFeature = Counter()

	## find all pats in the document <udocID>
	pats = list( co_pats.find( {'udocID': udocID} ) )

	if config.verbose:
		print >> sys.stderr, '\t%s (%d pats)\t' % (  color.render('#' + str(udocID), 'y'), len(pats))

	for pat in pats:

		if get_count(pat['pattern']) >= config.min_count:
			patFeature[ pat['pattern'] ] += 1

	return patFeature
Example #36
def build_lexicon():

    print 'type: ', wordType
    print 'lemma: ', str(lemma)

    keyword_list = [
        x['word'] for x in list(co_keywords.find({'type': wordType}))
    ]

    keywordCount = defaultdict(Counter)

    for (ie, e) in enumerate(emotions):

        print >> sys.stderr, '%d > %s' % (ie, color.render(e, 'g'))

        for doc in co_docs.find({'emotion': e, 'ldocID': {'$lt': 800}}):

            udocID = doc['udocID']
            mdocs = list(co_sents.find({'udocID': udocID}))

            for mdoc in mdocs:

                words = mdoc['sent'].split(' ')
                POSs = [
                    x.split('/').pop() for x in mdoc['sent_pos'].split(' ')
                ]

                for idx, word in enumerate(words):

                    word = word.lower()

                    if lemma:

                        if POSs[idx].startswith('N'): pos = 'n'
                        elif POSs[idx].startswith('V'): pos = 'v'
                        elif POSs[idx].startswith('J'): pos = 'a'
                        elif POSs[idx].startswith('R'): pos = 'r'
                        else: pos = None
                        if pos:
                            word = lmtzr.lemmatize(word, pos)

                    if word in keyword_list:
                        keywordCount[word][e] += 1

    for word in keywordCount:
        mdoc = {'keyword': word, 'count': keywordCount[word]}
        co_keyword_lexicon.insert(mdoc)
def get_document_feature(udocID):

	docfeature = Counter()

	## find all pats in the document <udocID>
	pats = list( co_pats.find( {'udocID': udocID} ) )

	if config.verbose:
		print >> sys.stderr, '\t%s (%d pats)\t' % (  color.render('#' + str(udocID), 'y'), len(pats))

	for pat in pats:

		patfeature = get_patfeature(pat['pattern'], udocID)

		for e in patfeature: 
			docfeature[e] += patfeature[e]

	return docfeature
def create_keyword_TFIDF_features(setting_id, training_TFIDF, testing_TFIDF):

	## list of emotions
	emotions = [ x['emotion'] for x in co_emotions.find( { 'label': 'LJ40K' } ) ]

	for (ie, gold_emotion) in enumerate(emotions):

		## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
		docs = list( co_docs.find( { 'emotion': gold_emotion } ) )

		if config.verbose:
			print >> sys.stderr, '%d > %s ( %d docs )' % ( ie, color.render(gold_emotion, 'g'), len(docs) )

		for doc in docs:

			udocID = doc['udocID']
			ldocID = doc['ldocID']

			if ldocID < 800: # training
				if udocID in training_TFIDF:
					if keyword_mode:
						feature = [(t, training_TFIDF[udocID][t]) for t in training_TFIDF[udocID] if t.lower() in keyword_list] ## use specified keyword list
					else:
						feature = dict(training_TFIDF[udocID]).items() ## no specified keyword list
				else:
					feature = []
			else:
				if udocID in testing_TFIDF:
					if keyword_mode:
						feature = [(t, testing_TFIDF[udocID][t]) for t in testing_TFIDF[udocID] if t.lower() in keyword_list] ## use specified keyword list
					else:
						feature = dict(testing_TFIDF[udocID]).items()
				else:
					feature = []

			mdoc = {
				"emotion": gold_emotion,
				"udocID": udocID,
				"feature": feature,
				"setting": setting_id # looks like "5369fb11d4388c0aa4c5ca4e"
			}
			co_feature.insert(mdoc)

	co_feature.create_index("setting")
def run(sid):
	c = Counter()
	root = os.path.join('tmp', sid)
	if not os.path.exists(root): os.makedirs(root)
	
	src_paths = {}
	for ftype in ('train', 'test', 'gold'):
		src_fn = '.'.join([sid,ftype,'txt'])
		src_path = os.path.join('tmp', src_fn)
		if not os.path.exists(src_path):
			print 'missing', src_path, 'run toSVM.py before transforming to binary'
			exit(-1)

		src_paths[ftype] = src_path

	if dest_files_exist(sid, root) and not config.overwrite:
		print 'all destination files already exist'
		exit(0)

	## load source files
	data = load_src_files(src_paths)

	## get all labels
	labels = set([x[0] for x in data['train']])

	for anchor in labels: # for each gold label, transform to binary
		
		binary_data = to_binary(data, anchor)

		print 'generating binary data for label', color.render(str(anchor), 'g')

		for ftype in data:

			dest_fn = '.'.join([anchor, 'b', ftype])
			dest_path = os.path.join(root, dest_fn)

			binary_labeled_feature = binary_data[ftype]

			with open(dest_path, 'w') as fw:
				for line_list in binary_labeled_feature:
					line_str = ' '.join(line_list) + '\n'
					fw.write(line_str)
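## A hedged sketch of the one-vs-rest relabeling that to_binary (defined
## elsewhere) presumably performs; rows are assumed to be lists whose
## first element is the gold label: the anchor label becomes '+1' and
## every other label '-1'.
def to_binary_sketch(data, anchor):
	binary_data = {}
	for ftype in data:
		binary_data[ftype] = [
			(['+1'] if row[0] == anchor else ['-1']) + row[1:]
			for row in data[ftype]
		]
	return binary_data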
Example #43
def run(sid):
    c = Counter()
    root = os.path.join('tmp', sid)
    if not os.path.exists(root): os.makedirs(root)

    src_paths = {}
    for ftype in ('train', 'test', 'gold'):
        src_fn = '.'.join([sid, ftype, 'txt'])
        src_path = os.path.join('tmp', src_fn)
        if not os.path.exists(src_path):
            print 'missing', src_path, 'run toSVM.py before transforming to binary'
            exit(-1)

        src_paths[ftype] = src_path

    if dest_files_exist(sid, root):
        exit(0)

    ## load source files
    data = load_src_files(src_paths)

    ## get all labels
    labels = set([x[0] for x in data['train']])

    for anchor in labels:  # for each gold label, transform to binary

        binary_data = to_binary(data, anchor)

        print 'generating binary data for label', color.render(
            str(anchor), 'g')

        for ftype in data:

            dest_fn = '.'.join([anchor, 'b', ftype])
            dest_path = os.path.join(root, dest_fn)

            binary_labeled_feature = binary_data[ftype]

            with open(dest_path, 'w') as fw:
                for line_list in binary_labeled_feature:
                    line_str = ' '.join(line_list) + '\n'
                    fw.write(line_str)
Example #45
def create_keyword_TFIDF_features(setting_id, training_TFIDF, testing_TFIDF):

	## list of emotions
	emotions = [ x['emotion'] for x in co_emotions.find( { 'label': 'LJ40K' } ) ]

	for (ie, gold_emotion) in enumerate(emotions):

		## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
		docs = list( co_docs.find( { 'emotion': gold_emotion } ) )

		if config.verbose:
			print >> sys.stderr, '%d > %s ( %d docs )' % ( ie, color.render(gold_emotion, 'g'), len(docs) )

		for doc in docs:

			udocID = doc['udocID']
			ldocID = doc['ldocID']

			if ldocID < 800: # training
				if udocID in training_TFIDF:
					# feature = dict(training_TFIDF[udocID]).items() ## no specified keyword list
					feature = [(t, training_TFIDF[udocID][t]) for t in training_TFIDF[udocID] if t.lower() in keyword_list] ## use specified keyword list
				else:
					feature = []
			else:
				if udocID in testing_TFIDF:
					# feature = dict(testing_TFIDF[udocID]).items()
					feature = [(t, testing_TFIDF[udocID][t]) for t in testing_TFIDF[udocID] if t.lower() in keyword_list] ## use specified keyword list
				else:
					feature = []

			mdoc = {
				"emotion": gold_emotion,
				"udocID": udocID,
				"feature": feature,
				"setting": setting_id # looks like "5369fb11d4388c0aa4c5ca4e"
			}
			co_feature.insert(mdoc)

	co_feature.create_index("setting")
def create_keyword_features():

	## list of emotions
	emotions = [ x['emotion'] for x in co_emotions.find( { 'label': 'LJ40K' } ) ]

	for (ie, gold_emotion) in enumerate(emotions):

		## get all documents with emotion <gold_emotion> (ldocID: 0-799 for training, 800-999 for testing)
		docs = list( co_docs.find( { 'emotion': gold_emotion } ) )

		print >> sys.stderr, '%d > %s ( %d docs )' % ( ie, color.render(gold_emotion, 'g'), len(docs) )

		for doc in docs:
			mdoc = {
				"emotion": gold_emotion,
				"udocID": doc['udocID'],
				"feature": get_keyword_feature(udocID=doc['udocID']).items(),
				"setting": setting_id # looks like "5369fb11d4388c0aa4c5ca4e"
			}
			co_feature.insert(mdoc)

	co_feature.create_index("setting")
Example #47
def extract_and_save(rows, target_postags, target_structures, det_db_cfg, target_word=None, mongodb=True):


	lmtzr = WordNetLemmatizer()


	print 'anchor pos tags:', color.render(', '.join(target_postags), 'lc')
	print 'structures:', color.render(', '.join([x[0]+':'+str(x[1]) for x in target_structures]), 'lc')
	print '='*60
	collect_cnt, skip_cnt = 0, 0	

	mc = pymongo.Connection(det_db_cfg['server_addr'])
	db = mc[det_db_cfg['db']]
	co = db[det_db_cfg['collection']]

	sent_cnt, total_word_cnt, anchor_word_cnt, anchor_word_structure_cnt = 0, 0, 0, 0


	for entry in rows:

		## extract rows
		sid, sent, pos, raw_tree, raw_dep = entry if not mongodb else (entry['id'], entry['sent'], entry['pos'], entry['tree'], entry['dep'])
		
		# read dependency and tree objs
		deps = dependency.read(raw_dep, return_type=dict)
		if not deps: continue
		tree = Tree(raw_tree)


		# collect certain dependency relations according to pre-specified pos tags
		## cdeps: [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7), (u'Yeah', u'JJ', 1), (u'well', u'NN', 2), (u'gotta', u'NN', 6), (u'bowl', u'NN', 11), (u'vinegar', u'NN', 13), (u'put', u'VBN', 9)]
		cdeps = extract_anchors(deps, tree, targets=target_postags)

		## for stat
		sent_cnt += 1
		total_word_cnt += len(tree.pos())
		anchor_word_cnt += len(cdeps)

		##  ('is', 'VBZ', 8) in [(u'is', u'VBZ', 8), (u"'ve", u'VBP', 5), (u'do', u'VBP', 7) ...]
		for (word, pos, idx) in cdeps:

			## check if this is the target word if a target specified
			if target_word and word.lower() != target_word.lower(): continue

			## extract dependency relations which match the target structures 
			rdeps = _filter_deps_by_rel(deps, anchor=(word, idx), targets=target_structures)

			if rdeps: ## got deps match the target structures

				print color.render('(anchor[v]) '+word+'-'+str(idx)+' #'+pos, 'g')

				T = [ _transform_to_tuple(dep) for dep in rdeps]
				for (rel, (l, li), (r, ri)) in T: print '  ',color.render(rel,'r'),color.render('( '+l+'-'+str(li)+', '+r+'-'+str(ri)+' )','y')

				lemma = lmtzr.lemmatize(word, _getWordNetPOS(pos))

				# generate mongo obj
				mongo_obj = {}
				mongo_obj['sid'] = sid 		# sentence id
				mongo_obj['word'] = word 	# anchor word
				mongo_obj['pos'] = pos 		# pos tag of word
				mongo_obj['idx'] = idx 		# word index 
				mongo_obj['deps'] = rdeps	# related deps
				mongo_obj['lemma'] = lemma	# word lemma
				
				co.insert(mongo_obj)

				anchor_word_structure_cnt += 1

	
	mc.close()

	print '='*60
	print 'write statistic log'
	with open('stat.log','w') as fw:
		fw.write('total sent'+'\t'+str(sent_cnt)+'\n')
		fw.write('total word'+'\t'+str(total_word_cnt)+'\n')
		fw.write('anchor word'+'\t'+str(anchor_word_cnt)+'\n')
		fw.write('anchor word with structures'+'\t'+str(anchor_word_structure_cnt)+'\n')
Example #48
	print '='*60

if __name__ == '__main__':

	######## sqlite version ########
	# db_path = 'data/bnc.db3'
	# sql = "select * from BNC_Parsed where sent like ?"
	# args = ['%'+'interested'+'%']
	# rows = fetch_sqlite(db_path, sql, args)

	######## mongo version ########
	doraemon = 'doraemon.iis.sinica.edu.tw'
	db_info = {'name': 'BNC', 'collection': 'Parsed'}

	# connect to mongo server
	print >> sys.stderr, color.render('fetching data','r'), '...',
	sys.stderr.flush()
	cur = fetch_mongo(doraemon, db_info, None)
	print >> sys.stderr, color.render('done','g')

	# get fetched data
	# print >> sys.stderr, color.render('limiting data','r'), '...',
	# sys.stderr.flush()
	# rows = cur.limit(1000)
	# print >> sys.stderr, color.render('done','g')

	## pre-specified target pos tags
	target_postags = ['JJ', 'VB', 'NN']

	## pre-specified structures
	## 1: necessary
Example #49
def main(argv, halt=False):

	# default value
	target = 'familiar'
	rule = [('subj', 1), ('cop', 1), ('prep', 1)]
	limit = -1
	dump = False

	var = _extract_opt(argv)
	target = target if not var['target'] else var['target'].strip()
	rule = rule if not var['rule'] else eval(var['rule'])
	limit = limit if not var['limit'] else int(var['limit'])
	dump = dump if not var['dump'] else var['dump']

	print >> sys.stderr, color.render("target:",'lc'),target
	print >> sys.stderr, color.render("rule:",'lc'),rule
	print >> sys.stderr, color.render("limit:",'lc'),limit
	print >> sys.stderr, color.render("dump:",'lc'),dump

	if halt:
		print >> sys.stderr, 'press to begin ...',raw_input()
	

	## ------------------------------ main program ------------------------------

	R = coDeps.find({'lemma': target}) if limit < 0 else coDeps.find({'lemma': target}).limit(limit)
	
	for entry in R:

		# get dependency relations 
		deps = entry['deps']

		# fetch original sentence info (including raw tree) to obtain pos tags
		raw = list(coParsed.find( {'id':entry['sid']} ))[0]

		tree = Tree(raw['tree'])

		# filter deps by pre-defined rule
		# and yield a dictionary with rel<str> as key, deps<list> as value
		rels = apply_rule(deps, rule)

		if not rels: continue

		combs = ListCombination(rels.values())
		
		# calculate weight of each combination
		weight = 1/float(len(combs)) if len(combs) > 1 else 1.0

		# form the anchor element using (word, index pair)
		anchor = (entry['word'], entry['idx'])

		# collect existing patterns object, ready to append new found patterns
		patterns = [] if 'patterns' not in entry else entry['patterns'] 

		# print 'sid >',

		for comb in combs:

			words = form(comb, anchor, tree)
			if not words: continue
			pattern = {'rule': rule, 'words': words, 'weight': weight}

			if pattern not in patterns:
				patterns.append(pattern)

			words_str = ' '.join([ color.render(x[0],'g') for x in words])
			print '(%s) %s' % (entry['sid'], words_str)
		
		## update mongo document
		if dump:
			save_extracted_patterns(mco=coDeps, sid=entry['sid'], lemma=target, patterns=patterns)
        elif opt in ('-o', '--overwrite'):
            config.overwrite = True

    ## select mongo collections
    co_emotions = db[config.co_emotions_name]
    co_docs = db[config.co_docs_name]
    co_pats = db[config.co_pats_name]
    co_lexicon = db[config.co_lexicon_name]
    co_patsearch = db[config.co_patsearch_name]

    # check if fetch source existed
    config.co_patscore_name = '_'.join(
        [config.co_patscore_prefix] +
        config.getOpts(fields=config.opt_fields[config.ps_name], full=False))
    co_patscore_existed = config.co_patscore_name in db.collection_names()
    if not co_patscore_existed:
        print >> sys.stderr, '(error) source collection', color.render(
            config.co_patscore_name, 'yellow'), 'does not exist'
        print >> sys.stderr, '\tcheck the fetch target and run again!!'
        exit(-1)

    # check if the destination collection existed
    config.co_docscore_name = '_'.join(
        [config.co_docscore_prefix] +
        config.getOpts(fields=config.opt_fields[config.ds_name], full=False))
    co_docscore_existed = config.co_docscore_name in db.collection_names()
    if co_docscore_existed and not config.overwrite:
        ## (warning) destination already exists
        print >> sys.stderr, '(warning) destination collection', color.render(
            config.co_docscore_name, 'red'), 'already exists'
        print >> sys.stderr, '\t  use -o or --overwrite to force update'
        exit(-1)
			# feature_names = {} # clear feature_names

			for setting_id in setting_ids:
				### =======================================
				### check if fetch collection existed
				### =======================================
				
				co_feature_name = 'features.'+settings[setting_id]['feature_name']
				if settings[setting_id]['feature_name'] == 'position':
					co_feature_name = 'features.pattern_emotion_position'
				co_feature_existed = co_feature_name in db.collection_names()
				if co_feature_existed:

					co_features[setting_id] = db[co_feature_name]
				else:
					print >> sys.stderr, '(error) source collection', color.render(co_feature_name, 'yellow'), 'does NOT exist'
					print >> sys.stderr, '\tcheck the fetch target and run again!!'
					exit(-1)

			print >> sys.stderr, '[info] fetching -->',
			sys.stderr.flush()
			fusion_id = get_fusion_id(setting_ids)

			### =======================================
			## check destination files/folder
			### =======================================
			new_pathes = check_destination(pathes, token=fusion_id, ext='txt')
			
			if not new_pathes:
				print >> sys.stderr, fusion_id, 'next'
				continue
Example #52
        elif opt in ('-l', '--limit'):
            config.min_count = int(arg.strip())
        elif opt in ('-v', '--verbose'):
            config.verbose = True
        elif opt in ('-o', '--overwrite'):
            config.overwrite = True

    ## fetch from collection
    config.co_docscore_name = '_'.join(
        [config.co_docscore_prefix] +
        config.getOpts(fields=config.opt_fields[config.ev_name], full=False))

    # if cannot find the fetch target collection
    co_docscore_existed = config.co_docscore_name in db.collection_names()
    if not co_docscore_existed:
        print >> sys.stderr, '(error) collection', color.render(
            config.co_docscore_name, 'yellow'), 'does not exist'
        print >> sys.stderr, '\tcheck the fetch target and run again!!'
        exit(-1)

    # check if the collection already exists
    cfg = ','.join(
        config.getOpts(fields=config.opt_fields[config.ev_name],
                       key_value='=',
                       full=True))
    mdoc_results_existed = True if db[config.co_results_name].find_one(
        {'cfg': cfg}) else False
    skip_eval = False if not mdoc_results_existed or config.overwrite else True

    co_docscore = db[config.co_docscore_name]
    co_results = db[config.co_results_name]
def get_keyword_feature(udocID):

    keywordFeature = Counter()

    sents = {
        x['usentID']: x['sent_length']
        for x in list(co_sents.find({'udocID': udocID}))
    }
    total_words = sum([sents[x] for x in sents])

    th1 = total_words * config.begPercentage / float(100)
    th2 = total_words * (config.begPercentage +
                         config.midPercentage) / float(100)

    ## find all words in the document <udocID>
    words = []
    POSs = []
    wordIDs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:

        ## words: list of 'happy'
        words.extend(sent_mdoc['sent'].split(' '))

        ## POSs: list of 'happy/JJ'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))

        ## wordIDs: list of 'word id'
        wordID_offset = 0
        for key in sents:
            if key < sent_mdoc['usentID']: wordID_offset += sents[key]
        wordIDs.extend([(x + 1 + wordID_offset)
                        for x in range(sents[sent_mdoc['usentID']])])

    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render(
            '#' + str(udocID), 'y'), len(words))

    for idx, word in enumerate(words):
        word = word.lower()

        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('N'): pos = 'n'
            elif POS.startswith('V'): pos = 'v'
            elif POS.startswith('J'): pos = 'a'
            elif POS.startswith('R'): pos = 'r'
            else: pos = None
            if pos:  # only lemmatize certain pos types
                word = lmtzr.lemmatize(word, pos)

        if wordIDs[idx] <= th1: position = 'beginning'
        elif wordIDs[idx] <= th2: position = 'middle'
        else: position = 'end'

        count = get_keyword_count(word)
        if not count: continue  # no counts for this word; skip it (cf. the variant below)
        count = remove_self_count(udocID, word, count)

        percentage = config.cutoffPercentage / float(100)
        binary_vector = accumulate_threshold(count, percentage)

        if config.featureValueType == 'b':
            for emo in binary_vector:
                key = '@' + position + '_' + emo
                keywordFeature[key] += binary_vector[emo]

        ## pattern count (frequency)
        elif config.featureValueType == 'f':
            count_vector = {
                e: count[e]
                for e in binary_vector if binary_vector[e] == 1
            }
            for emo in count_vector:
                key = '@' + position + '_' + emo
                keywordFeature[key] += count_vector[emo]

        ## keyword score
        elif config.featureValueType == 's':
            keyword_score = scoring(count)
            score_vector = {
                e: keyword_score[e]
                for e in binary_vector if binary_vector[e] == 1
            }
            for emo in score_vector:
                key = '@' + position + '_' + emo
                keywordFeature[key] += score_vector[emo]

        else:
            return False

    return keywordFeature
def get_keyword_feature(udocID):

    ## find all words in the document <udocID>
    words = []
    POSs = []
    sent_mdocs = list(co_sents.find({'udocID': udocID}))
    for sent_mdoc in sent_mdocs:

        ## words: list of 'happy'
        words.extend(sent_mdoc['sent'].split(' '))

        ## POSs: list of 'happy/JJ'
        POSs.extend(sent_mdoc['sent_pos'].split(' '))

    if config.verbose:
        print >> sys.stderr, '\t%s (%d words)\t' % (color.render(
            '#' + str(udocID), 'y'), len(words))

    ## create keyword features
    keywordFeature = Counter()
    for idx, word in enumerate(words):
        word = word.lower()

        if config.lemma:
            POS = POSs[idx].split('/').pop()
            if POS.startswith('N'): pos = 'n'
            elif POS.startswith('V'): pos = 'v'
            elif POS.startswith('J'): pos = 'a'
            elif POS.startswith('R'): pos = 'r'
            else: pos = None
            if pos:  # only lemmatize certain pos types
                word = lmtzr.lemmatize(word, pos)

        count = get_keyword_count(word)

        if not count:
            if config.debug:
                print 'no count for', word, ', skipping to next word.'
            continue  # if no count, skip this word
        else:
            count = remove_self_count(udocID, word, count)

            percentage = config.cutoffPercentage / float(100)

            binary_vector = accumulate_threshold(count, percentage)

            if config.featureValueType == 'b':
                for emo in binary_vector:
                    keywordFeature[emo] += binary_vector[emo]

            ## pattern count (frequency)
            elif config.featureValueType == 'f':
                count_vector = {
                    e: count[e]
                    for e in binary_vector if binary_vector[e] == 1
                }
                for emo in count_vector:
                    keywordFeature[emo] += count_vector[emo]
            ## keyword score
            elif config.featureValueType == 's':
                keyword_score = scoring(count)
                score_vector = {
                    e: keyword_score[e]
                    for e in binary_vector if binary_vector[e] == 1
                }
                for emo in score_vector:
                    keywordFeature[emo] += score_vector[emo]
            else:
                return False  # unknown feature value type

    return keywordFeature
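## A hedged sketch of accumulate_threshold (its definition is not shown
## in this listing): keep the highest-count emotions until their
## cumulative share of the total reaches <percentage>, marking the kept
## emotions with 1 in a binary vector. Callers above skip empty counts.
def accumulate_threshold_sketch(count, percentage):
    total = float(sum(count.values()))
    binary_vector, accum = {}, 0.0
    for emo, n in sorted(count.items(), key=lambda x: x[1], reverse=True):
        if accum / total >= percentage:
            break
        binary_vector[emo] = 1
        accum += n
    return binary_vector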
Example #55
    ## select collections
    co_svm_eval = db[config.co_svm_eval_name]
    co_svm_out = db[config.co_svm_out_name]
    co_svm_gold = db[config.co_svm_gold_name]

    ## generate to do list
    if update_all:
        to_do_list = find_availale_experiments()
    else:
        ## check setting id
        if not setting_id:
            print >> sys.stderr, '[error] specify a setting id'
            exit(-1)
        else:
            setting_ids = [setting_id]

        to_do_list = [(setting_id, param)]

    for (setting_id, param) in to_do_list:

        print >> sys.stderr, '[run] processing', color.render(
            setting_id, 'g'), color.render(param, 'y')
        eval_mdoc = run(setting_id, param)

        if intersection:
            find_intersection(eval_mdoc)

        if config.verbose:
            pprint(eval_mdoc)
Example #56
        size = os.stat(abs_path).st_size

        if fn.endswith('.m'):
            fns[sid]['model'] = size
        elif fn.endswith('.gold.txt'):
            fns[sid]['gold'] = size
        elif fn.endswith('.train.txt'):
            fns[sid]['train'] = size
        elif fn.endswith('.test.txt'):
            fns[sid]['test'] = size
        elif fn.endswith('.out'):
            fns[sid]['out'] = size
    else:
        continue

for sid in fns:

    if 0 in fns[sid].values():
        status = 'need to check'
    elif len(fns[sid]) == 3:
        status = color.render('3/5', 'r')
    elif len(fns[sid]) == 4:
        status = color.render('4/5', 'y')
    elif len(fns[sid]) == 5:
        status = color.render('all done', 'g')

    print sid, '(', status, ')'

    for ftype in fns[sid]:
        print '\t', ftype, '\t', fns[sid][ftype]
            ## extract all sentences in one document
            sents = extract_sents(doc)

            for sent in sents:

                ## for each sentence, extract patterns
                pats = extract_pattern(sent, targets, rule)

                ## display results
                if config.verbose:
                    sent_str = ' '.join([
                        k[0] for k in sorted(set(
                            reduce(lambda x, y: x + y, [((d['x'], d['xIdx']),
                                                         (d['y'], d['yIdx']))
                                                        for d in sent])),
                                             key=lambda a: a[1])
                    ][1:])
                    print '> %s (%s)' % (sent_str,
                                         color.render(str(len(pats)), 'lc'))
                    for p in pats:
                        pat_str = ' '.join([x[0] for x in p['pat']])
                        print '  ' + color.render(pat_str.lower(), 'g'), round(
                            p['weight'], 2)

                ## store back in mongo
                store_mongo(sent, pats, co_pats, topic_or_emotion)

            print '> %s / %s' % (udocID, MaxudocID)
            if config.verbose:
                print '%s end of document %d %s' % ('=' * 20, udocID, '=' * 20)