def generate_falsewords(rel):
    # Paths follow the per-relation layout used by condense_category below;
    # the original concatenated url and the file name inconsistently and
    # never used the rel argument.
    file = open(url + rel + '/1')
    file1 = open(url + rel + '/0')
    file2 = open(url + rel + '/0+1+2', 'a')
    truekeywords = []
    falsekeywords = []
    # Collect conjunction/verb keywords from the positive samples (file '1').
    for line in file.readlines():
        results = line.split()
        names = [results[1], results[2]]
        news = results[3]
        words = cutnews.cut_news(news, names)
        # Drop non-key POS attributes.
        if words:
            for i in range(0, len(words)):
                word = words[i][0]
                flag = words[i][1]
                if flag in ['c', 'v']:
                    if word not in truekeywords:
                        truekeywords.append(word)

    # From the negative samples (file '0'), keep lines where both names are
    # tagged as person names, then collect keywords not already seen above.
    for line in file1.readlines():
        results = line.split()
        names = [results[1], results[2]]
        news = results[3]
        k = 1
        for name in names:
            # 'pesg' is presumably jieba's posseg module, imported elsewhere.
            flags = []
            for word, flag in pesg.cut(name):
                flags.append(flag)
            if flags[0] not in ['nr', 'nrfg']:
                k = 0
        if k == 1:
            words = cutnews.cut_news(news, names)
            # Drop non-key POS attributes.
            if words:
                length = len(words)
                for i in range(0, length):
                    word = words[i][0]
                    flag = words[i][1]
                    next_flag = ''  # computed but never used in the original
                    if i < length - 1:
                        next_flag = words[i + 1][1]
                    if i == length - 1 and length - 2 >= 0:
                        next_flag = words[i - 1][1]
                    if flag in ['c', 'v']:
                        if word not in truekeywords + falsekeywords:
                            falsekeywords.append(word)
    falsekeywords = sorted(set(falsekeywords))
    s = ''
    for word in falsekeywords:
        s = s + word + ' '
    file2.write(s)
    file.close()
    file1.close()
    file2.close()
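
url and pesg are module-level globals that none of these snippets define. Judging from the paths in test_mate and condense_category below, url points at a directory of per-relation folders holding numbered sample files; the layout sketched here is an inference from the other examples, not something the source confirms:

# Assumed layout, inferred from the other examples in this file:
# url + rel + '/1'      positive samples, one 'rel name1 name2 news' line each
# url + rel + '/0'      negative samples, same format
# url + rel + '/0+1+2'  output file: space-separated false keywords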
Example #3
def match(news, names):
    rels_count = 0
    names_count = 0
    jieba_results = cutnews.cut_news(news, names)
    if jieba_results:
        rels_loc = jieba_results[1] + []
        names_loc = jieba_results[2] + []
        results = jieba_results[0]
        rels_count = len(rels_loc)
        # The original overwrote names_loc with its own length here,
        # leaving names_count stuck at 0.
        names_count = len(names_loc)
        # Bitwise &/| replaced with logical and/or: '==' binds looser than
        # '&', so the original conditions did not test what they read as.
        if rels_count == 2 and names_count == 2:
            if ((rels_loc[0] > names_loc[0] and rels_loc[0] > names_loc[1]
                 and rels_loc[1] > names_loc[1])
                    or (rels_loc[0] < names_loc[0]
                        and (rels_loc[1] < names_loc[1]
                             or rels_loc[1] > names_loc[0]))
                    or (rels_loc[0] < names_loc[0]
                        and rels_loc[1] > names_loc[1])):
                return False
            elif rels_loc[0] > names_loc[1]:
                # 'words' was undefined here; the (word, flag) list is 'results'.
                for word, flag in results[rels_loc[0]:rels_loc[1] + 1]:
                    if flag == v_transform:
                        # The original compared the integer index rels_loc[0]
                        # to a list; the relation word at that position is
                        # presumably what was meant.
                        if results[rels_loc[0]][0] in ['妻子', '女友']:
                            pass
            elif rels_loc[0] + 1 != rels_loc[1]:
                pass
        elif rels_count == 2 and names_count == 3:
            pass
        elif rels_count == 1:
            return model.single_relationship(results, news)
    return False
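
Note the return-shape discrepancy across these call sites: match indexes the cut_news result as a triple, while generate_falsewords and the later examples iterate it directly as (word, flag) pairs. A hypothetical stub capturing the shape match assumes, inferred purely from this file rather than from cutnews itself:

# Hypothetical stub; the real cutnews.cut_news is not shown in these examples.
def cut_news_stub(news, names):
    results = [('张三', 'namewords'), ('的', 'uj'), ('妻子', 'relationwords')]
    rels_loc = [2]   # indices into results where relation words sit
    names_loc = [0]  # indices into results where person names sit
    return results, rels_loc, names_loc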
Example #4
def test_mate():
    # Per-relation match-rate notes from the original (translated):
    # relatively high: 传闻不和
    # ~50%: 偶像, 分手, 同居
    # ~30%: 同学
    # 前女友, 前妻, 同为校花
    rel_list = [
        '暧昧', '传闻不和', '翻版', '绯闻女友', '分手', '闺蜜', '经纪人', '老师', '老乡', '偶像', '朋友',
        '妻子', '前女友', '前妻', '同居', '同为校花', '昔日情敌', '同学', '撞衫'
    ]
    for r in rel_list:
        # leftover debug filter from the original:
        #        if r == '绯闻女友':
        k = 0
        line_list = open(
            '/Users/wutong/workspace/da/baidu/data/condensedata/traindatacategory/'
            + r + '/1').readlines()
        for line in line_list:
            results = line.split()
            news = results[3]
            names = [results[1], results[2]]
            words_cut = cutnews.cut_news(news, names)
            if words_cut:
                words_mate = mate_test(words_cut[0], news)
                if words_mate:
                    k = k + 1
        print r, '=', k, ':', len(line_list)
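
Each iteration prints one Python 2 line of the form relation = matches : total; a made-up sample of that output, for illustration only:

传闻不和 = 72 : 100
偶像 = 49 : 100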
Example #5
def condense_database():
    client = MongoClient('localhost', 27017)
    db = client['baidu_revise']
    for i in range(0, 50):
        print i
        # db.names_require[i * 2] addresses the sub-collection
        # 'names_require.<i * 2>'.
        db_client = db.names_require[i * 2]
        j = 0  # documents kept
        k = 0  # documents deleted
        m = 0  # documents scanned
        for cur in db_client.find({}):
            m = m + 1
            names = [
                cur['names'][0].encode('utf-8'),
                cur['names'][1].encode('utf-8')
            ]
            news = cur['news'].encode('utf-8')
            if cutnews.cut_news(news, names):
                j = j + 1
            else:
                k = k + 1
                # Delete by _id instead of passing the whole document as the filter.
                db_client.delete_one({'_id': cur['_id']})
            if m % 500000 == 0:
                print 'm', m
                print 'j', j
                print 'k', k
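
On the db.names_require[i * 2] indexing above: pymongo's Collection.__getitem__ returns the dotted sub-collection, so the loop walks collections named names_require.0, names_require.2, and so on. A quick check (database and collection names taken from this example):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['baidu_revise']
print db.names_require[0].full_name  # prints: baidu_revise.names_require.0
print db.names_require[2].full_name  # prints: baidu_revise.names_require.2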
def condense_key_words_revise(rel):
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    relationship_keywords = db.relationship_keywords1
    file = open('/Users/wutong/workspace/da/baidu/data/condensedata/condensetraindata.txt')
    for line in file.readlines():
        items = line.split()
        names = [items[1], items[2]]
        news = items[3]
        rela = items[0]
        if rela == rel:
            results = cutnews.cut_news(news, names)
            # cut_news may return nothing; the other examples guard for this.
            if not results:
                continue
            for word, flag in results:
                if flag not in ['relationwords', 'namewords']:
                    relationship_keywords.update(
                        {'name': rel},
                        {'$addToSet': {'keywords': word}})
    file.close()
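
Collection.update is the legacy pymongo write API and was removed in PyMongo 4. On current drivers the same $addToSet write looks like the sketch below; upsert=True is an assumption about the intended behavior when no document for rel exists yet:

relationship_keywords.update_one(
    {'name': rel},
    {'$addToSet': {'keywords': word}},
    upsert=True)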
Example #8
def condense_category():
    client = MongoClient('localhost', 27017)
    db = client['baidu']
    for rel in relationship:
        u_r = url + rel + '/' + '0'
        u_w = url + rel + '/' + '0_revise_revise'
        file_r = open(u_r)
        file_w = open(u_w, 'a')
        for line in file_r.readlines():
            items = line.split()
            k = 0
            # Keep the line only if both names occur in db.namelist and the
            # segmentation tags fewer than two tokens as person names.
            if len([cur for cur in db.namelist.find({'name': items[1]})]):
                if len([cur for cur in db.namelist.find({'name': items[2]})]):
                    for word, flag in cutnews.cut_news(items[0],
                                                       [items[1], items[2]]):
                        if flag in ['nr', 'nrfg']:
                            k = k + 1
                    if k < 2:
                        file_w.write(line)
        file_r.close()
        file_w.close()
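
Materializing a full result list just to take its len is an expensive existence test; pymongo's find_one does the same gate in one round trip. A sketch of the equivalent check against the same namelist collection:

if db.namelist.find_one({'name': items[1]}) and \
        db.namelist.find_one({'name': items[2]}):
    pass  # proceed with the POS-flag count as above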
def generate_vague_words():
    file = open(url + '2')
    file1 = open(url + '22', 'a')
    org = []
    for line in file.readlines():
        words = line.split()
        names = [words[1], words[2]]
        news = words[3]
        results = cutnews.cut_news(news, names)
        if results:
            # Keep ordinary words verbatim, but replace relation/name hits
            # with their flag so different pairs collapse onto one pattern.
            org_item = []
            for word, flag in results:
                if flag not in ['relationwords', 'namewords']:
                    org_item.append(word)
                else:
                    org_item.append(flag)
            if org_item not in org:
                org.append(org_item)
    for org_item in org:
        file1.write(' '.join(org_item) + '\n')
    file.close()
    file1.close()
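
A hedged illustration of the pattern this produces, assuming the whitespace-separated 'rel name1 name2 news' line format the other examples use (the concrete words are invented):

input line:  妻子 张三 李四 张三的妻子是李四
output line: namewords 的 relationwords 是 namewords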