def generate_falsewords(rel):
    # NOTE: `rel` is unused as written; the paths below are kept from the original.
    file = open(url + '1')            # positive samples
    file1 = open(url + '/0')          # negative samples
    file2 = open(url + '0+1+2', 'a')  # output: false keywords
    truekeywords = []
    falsekeywords = []
    # Pass 1: collect conjunction/verb keywords from the positive samples.
    for line in file.readlines():
        results = line.split()
        names = [results[1], results[2]]
        news = results[3]
        words = cutnews.cut_news(news, names)  # strip non-key attributes
        if words:
            for i in range(0, len(words)):
                word = words[i][0]
                flag = words[i][1]
                if flag in ['c', 'v']:
                    if word not in truekeywords:
                        truekeywords.append(word)
    # Pass 2: from negative samples whose names are both tagged as person names,
    # collect conjunction/verb words not already seen as true keywords.
    for line in file1.readlines():
        results = line.split()
        names = [results[1], results[2]]
        news = results[3]
        k = 1
        for name in names:
            flags = [flag for word, flag in pseg.cut(name)]  # was `pesg`; jieba.posseg is conventionally bound to `pseg`
            if flags[0] not in ['nr', 'nrfg']:  # first token is not a person name
                k = 0
        if k == 1:
            words = cutnews.cut_news(news, names)  # strip non-key attributes
            if words:
                length = len(words)
                for i in range(0, length):
                    word = words[i][0]
                    flag = words[i][1]
                    next_flag = ''  # NOTE: computed but never used below
                    if i < length - 1:
                        next_flag = words[i + 1][1]
                    if i == length - 1 and length - 2 >= 0:
                        next_flag = words[i - 1][1]
                    if flag in ['c', 'v']:
                        if word not in truekeywords + falsekeywords:
                            falsekeywords.append(word)
    falsekeywords = sorted(set(falsekeywords))  # was set(sorted(...)), which throws the ordering away
    s = ''
    for word in falsekeywords:
        s = s + word + ' '
    file2.write(s)
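# A minimal sketch of the cut_news interface the functions in this module
# assume; cutnews is not shown here, so the shapes below are inferred, not
# confirmed. generate_falsewords() treats the return value as a list of
# (word, flag) pairs, while match() below also indexes element 1 (relation-word
# positions) and element 2 (name positions). `_cut_news_stub` is a hypothetical
# stand-in, not the real implementation; relation-word detection and encoding
# handling are omitted.
def _cut_news_stub(news, names):
    import jieba.posseg as pseg
    results, rels_loc, names_loc = [], [], []
    for i, (word, flag) in enumerate(pseg.cut(news)):
        if word in names:
            flag = 'namewords'  # the real cut_news also tags 'relationwords'
            names_loc.append(i)
        results.append((word, flag))
    # the real cut_news apparently returns a falsy value for discardable lines
    return [results, rels_loc, names_loc]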
def test_mate():
    # Match-rate notes per relation (translated from the original comments):
    #   '传闻不和' is comparatively high
    #   ~50%: '偶像', '分手', '同居'
    #   ~30%: '同学'
    #   also checked: '前女友', '前妻', '同为校花'
    rel_list = ['暧昧', '传闻不和', '翻版', '绯闻女友', '分手', '闺蜜', '经纪人',
                '老师', '老乡', '偶像', '朋友', '妻子', '前女友', '前妻', '同居',
                '同为校花', '昔日情敌', '同学', '撞衫']
    for r in rel_list:
        # if r == '绯闻女友':
        k = 0
        line_list = open('/Users/wutong/workspace/da/baidu/data/condensedata/traindatacategory/'
                         + r + '/1').readlines()
        for line in line_list:
            results = line.split()
            news = results[3]
            names = [results[1], results[2]]
            words_cut = cutnews.cut_news(news, names)
            if words_cut:
                words_mate = mate_test(words_cut[0], news)
                if words_mate:
                    k = k + 1
        print r, '=', k, ':', len(line_list)
def condense_database():
    client = MongoClient('localhost', 27017)
    db = client['baidu_revise']
    for i in range(0, 50):
        print i
        db_client = db.names_require[i * 2]  # sub-collection names_require.<2i>
        j = 0  # records kept
        k = 0  # records deleted
        m = 0  # records scanned
        for cur in db_client.find({}):
            m = m + 1
            names = [cur['names'][0].encode('utf-8'),
                     cur['names'][1].encode('utf-8')]
            news = cur['news'].encode('utf-8')
            if cutnews.cut_news(news, names):
                j = j + 1
            else:
                k = k + 1
                db_client.delete_one(cur)
            if m % 500000 == 0:
                print 'm', m
                print 'j', j
                print 'k', k
def match(news, names):
    # `v_transform` and `model` are module-level names assumed to be defined elsewhere.
    rels_count = 0
    names_count = 0
    jieba_results = cutnews.cut_news(news, names)
    if jieba_results:
        rels_loc = jieba_results[1] + []   # copy of relation-word positions
        names_loc = jieba_results[2] + []  # copy of name positions
        results = jieba_results[0]
        rels_count = len(rels_loc)
        names_count = len(names_loc)  # was `names_loc = len(names_loc)`, which clobbered the list
        # `and`/`or` replace the original `&`/`|`: the bitwise operators bind
        # tighter than `==`, so `rels_count == 2 & names_count == 2` parsed as
        # `rels_count == (2 & names_count) == 2`.
        if rels_count == 2 and names_count == 2:
            if (((rels_loc[0] > names_loc[0] and rels_loc[0] > names_loc[1])
                 and rels_loc[1] > names_loc[1])
                    or (rels_loc[0] < names_loc[0]
                        and (rels_loc[1] < names_loc[1] or rels_loc[1] > names_loc[0]))
                    or (rels_loc[0] < names_loc[0] and rels_loc[1] > names_loc[1])):
                return False
            elif rels_loc[0] > names_loc[1]:
                for word, flag in results[rels_loc[0]:rels_loc[1] + 1]:  # was `words`, which is undefined here
                    if flag == v_transform:
                        # was `rels_loc[0] == ['妻子', '女友']`, an index-vs-list
                        # comparison that is always False; checking the relation
                        # word at that position looks like the intent
                        if results[rels_loc[0]][0] in ['妻子', '女友']:
                            pass
            elif rels_loc[0] + 1 != rels_loc[1]:
                pass
        elif rels_count == 2 and names_count == 3:
            pass
        elif rels_count == 1:
            return model.single_relationship(results, news)
    return False
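# Hedged usage sketch: driving match() from one of the training files above.
# The `rel name1 name2 news` field layout is taken from the other readers in
# this module; the function name and behavior here are illustrative only.
def _match_file_stub(path):
    for line in open(path).readlines():
        items = line.split()
        names = [items[1], items[2]]  # the two entity names
        news = items[3]               # the news snippet
        if match(news, names):
            print items[0], names[0], names[1]  # relation judged present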
def condense_key_words_revise(rel):
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    relationship_keywords = db.relationship_keywords1
    file = open('/Users/wutong/workspace/da/baidu/data/condensedata/condensetraindata.txt')
    for line in file.readlines():
        items = line.split()
        names = [items[1], items[2]]
        news = items[3]
        rela = items[0]
        if rela == rel:
            results = cutnews.cut_news(news, names)
            if not results:  # cut_news can return a falsy value, as the other callers assume
                continue
            for word, flag in results:
                if flag not in ['relationwords', 'namewords']:
                    # record every non-relation, non-name word as a keyword of this relation
                    relationship_keywords.update(
                        {'name': rel},
                        {'$addToSet': {'keywords': word}})
def condense_category():
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    for rel in relationship:
        u_r = url + rel + '/' + '0'
        u_w = url + rel + '/' + '0_revise_revise'
        file_r = open(u_r)
        file_w = open(u_w, 'a')
        for line in file_r.readlines():
            k = 0
            items = line.split()
            # keep the line only when both names are in the name list and the
            # news text carries fewer than two person-name tags
            if db.namelist.find_one({'name': items[1]}):  # find_one beats materializing the full cursor
                if db.namelist.find_one({'name': items[2]}):
                    # was `items[0]`; the other readers in this module treat
                    # field 3 as the news text, so field 0 looked like a bug
                    results = cutnews.cut_news(items[3], [items[1], items[2]]) or []
                    for word, flag in results:
                        if flag in ['nr', 'nrfg']:
                            k = k + 1
                    if k < 2:
                        file_w.write(line)
def generate_vague_words():
    file = open(url + '2')
    file1 = open(url + '22', 'a')
    org = []
    for line in file.readlines():
        words = line.split()
        names = [words[1], words[2]]
        news = words[3]
        results = cutnews.cut_news(news, names)
        if results:
            # replace relation/name tokens with their tags, keep other words verbatim
            org_item = []
            for word, flag in results:
                if flag not in ['relationwords', 'namewords']:
                    org_item.append(word)
                else:
                    org_item.append(flag)
            if org_item not in org:  # deduplicate identical patterns
                org.append(org_item)
    for org_item in org:
        s = ''
        for o in org_item:
            s = s + o + ' '
        file1.write(s + '\n')
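# Hedged sketch of how the per-relation generators above might be driven. The
# relation subset is illustrative, and this driver assumes the `url` global is
# pointed at the matching category directory before each call; it is not code
# from the original module.
def _run_pipeline_stub():
    for rel in ['妻子', '前女友', '同学']:  # illustrative subset of rel_list
        generate_falsewords(rel)
        condense_key_words_revise(rel)
    generate_vague_words()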