Code example #1
File: zhihu-keywords.py Project: skyam/py3-utils
def getAnswerKeys(text_set, api_key):
    keys = []
    nlp = BosonNLP(api_key)
    result = nlp.extract_keywords(text_set, top_k=30)
    for weight, word in result:
        keys.append((word, int(weight * 10)))
    return keys
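A minimal call sketch for the function above, assuming the bosonnlp SDK is installed; the token and input text are placeholders, and extract_keywords yields (weight, word) pairs as the loop expects.

# Hypothetical usage; 'YOUR_API_TOKEN' and the text are placeholders.
from bosonnlp import BosonNLP  # pip install bosonnlp

texts = ['这是一段用于测试的文本']
print(getAnswerKeys(texts, 'YOUR_API_TOKEN'))  # [(word, int(weight * 10)), ...]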
Code example #2
def getKeyWords(command):
    nlp = BosonNLP("IrtCRUKX.4360.giOuq49VR3V-")
    r = nlp.extract_keywords(command, top_k=3)
    l = []
    for k, v in r:
        v = v.encode('utf8')
        l.append(v)
    return l
Code example #3
File: views.py Project: mmlfs/Tutu
def getKeyWords(command):
    nlp = BosonNLP("ofW2OZMI.4712.UzT0VvLGGkdi")
    r = nlp.extract_keywords(command, top_k=3)
    l = []
    for k, v in r:
        v = v.encode('utf8')
        l.append(v)
    return l
Code example #4
def extract_keywords(text, top_num=10):
    """Extract Keywords."""
    # Note: replace this with your own API token when testing
    nlp = BosonNLP('')
    result = nlp.extract_keywords(text, top_k=top_num)

    result_dict = {k: v for (v, k) in result}

    return result_dict
Code example #5
class Scanner(object):
    """
    bosonnlp 中文分词
    """
    def __init__(self):
        self.nlp_handler = BosonNLP(API_TOKEN_BOSONNLP)

    def get_tag(self, content, remove_punctuations=False):
        """
        分词后的结果,返回的是每个词的列表
        """
        result = self.nlp_handler.tag(content)[0]
        if remove_punctuations:
            return [
                x for x, y in zip(result['word'], result['tag']) if y[0] != 'w'
            ]
        return result['word']

    def get_key_word(self, content, segmented=False):
        """提取关键词"""
        keywords = self.nlp_handler.extract_keywords(content, 2, segmented)
        firstkey, secondkey = keywords[0], keywords[1]
        return firstkey[1] if (firstkey[0] - secondkey[0]) > 0.3\
            else ' '.join([firstkey[1], secondkey[1]])
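A minimal usage sketch for Scanner; it assumes API_TOKEN_BOSONNLP is defined in the surrounding module, which this snippet does not show.

# Hypothetical usage; assumes API_TOKEN_BOSONNLP is configured.
scanner = Scanner()
print(scanner.get_tag('今天天气不错', remove_punctuations=True))  # segmented words
print(scanner.get_key_word('今天天气不错'))  # one or two top keywords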
Code example #6
def Key_word(text):
    nlp = BosonNLP("x-gOGutn.27554.G6_6QvdJafES")
    rest = nlp.extract_keywords(text, top_k=20)
    return rest
Code example #7
# Assumes the surrounding module imports pymysql, gensim's similarities,
# collections.OrderedDict, and the Passage model used below.
def get_similar_text(text):
    BosonNLPKey = "BMRivntt.8194.5zwvLwj_ygkV"
    # ZfXxO6kv.10841.LZ_TDcJiiwrl
    nlp = BosonNLP(BosonNLPKey)

    # Get keywords, e.g. [[0.8391345017584958, '病毒式'], [0.3802418301341705, '蔓延']]
    keyword_result = nlp.extract_keywords(text, top_k=10)
    print(keyword_result)
    # keyword_result = [[0.36443758441765906, '卖艺'], [0.26732109821670036, '路学院'], [0.2667011448568187, '挣钱'], [0.23801264121689353, '天目山'], [0.22618147770853975, '走时'], [0.22553389408212396, '孩子'], [0.21006863070452944, '一老一少'], [0.18830464004379927, '儿子'], [0.18298766176875855, '育才'], [0.17494405471962107, '红绿灯']]
    key_word_list = []
    for key_item in keyword_result:
        if key_item[0] > 0.15 and len(key_item[1]) < 10:
            key_word_list.append((key_item[1], key_item[0]))

    conn = pymysql.connect(host='127.0.0.1',
                           user='******',
                           passwd='Ryan',
                           db="omdb",
                           charset='UTF8')
    cur = conn.cursor()
    vec_tfidf = []
    for key in key_word_list:
        sql_str = "SELECT kid from opinionmonitor_keyword WHERE word = '{0}'".format(
            key[0])
        cur.execute(sql_str)
        for ii in cur:
            vec_tfidf.append((ii[0], key[1]))

    pid_list = []
    for key_id in vec_tfidf:
        sql_str = "SELECT pid_id from opinionmonitor_passagekeyword WHERE kid_id = {0}".format(
            key_id[0])
        cur.execute(sql_str)
        for ii in cur:
            pid_list.append(ii[0])

    # Count how many times each passage id occurs
    pid_set = set(pid_list)
    corpus_tfidf = []
    p_id_list = []
    for pid in pid_set:
        c = pid_list.count(pid)

        # Add the passage to the result set (the original comment said
        # "only if the count is greater than 2", but the code checks c > 0)
        if c > 0:
            sql_str = "SELECT kid_id, ratio from opinionmonitor_passagekeyword WHERE pid_id = {0}".format(
                pid)
            cur.execute(sql_str)
            kr_list = []
            for ii in cur:
                kr_list.append((ii[0], ii[1]))
            corpus_tfidf.append(kr_list)
            p_id_list.append(pid)

    cur.close()  # close the cursor
    conn.commit()  # commit pending transactions (a no-op for engines without transaction support)
    conn.close()  # close the database connection and release resources

    # Compute cosine similarity
    print("Comparison matrix")
    print(corpus_tfidf)
    print("Text vector")
    print(vec_tfidf)
    if not corpus_tfidf:
        return dict()
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]

    r_pid_dict = dict()
    sim_result_list = list(sims)

    for index, item in enumerate(sim_result_list):
        if item > 0.5:
            r_pid_dict[p_id_list[index]] = item

    result_list = []
    result_order_dict = OrderedDict()
    r_pid_dict = OrderedDict(sorted(r_pid_dict.items(), key=lambda x: x[1]))
    print(r_pid_dict)
    for k, v in r_pid_dict.items():
        passage_list = Passage.objects.filter(pid=k)
        passage = passage_list[0]
        passage.st = v
        item_tuple = passage.to_tuple()
        result_order_dict[item_tuple[0]] = item_tuple[1]
        # result_list.append(passage.to_tuple())
    return result_order_dict
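The similarity step above feeds gensim's MatrixSimilarity with sparse (keyword_id, weight) vectors pulled from the database. A self-contained sketch of that mechanism with invented toy data:

# Minimal sketch of the gensim similarity step; ids and weights are invented.
from gensim import similarities

corpus = [
    [(0, 0.8), (1, 0.3)],        # passage A as sparse (keyword_id, weight) pairs
    [(1, 0.5), (2, 0.9)],        # passage B
]
query = [(0, 0.7), (2, 0.2)]     # the new text's keyword vector

index = similarities.MatrixSimilarity(corpus, num_features=3)
for pid, score in enumerate(index[query]):
    print(pid, float(score))     # cosine similarity of the query against each passage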
Code example #8
def main():
    global last_update_count, last_extrect_tag_time
    token = json.load(open('./config/token.json', 'r'))
    if DEBUG:
        client = EvernoteClient(token=token['en_token'], sandbox=True)
    else:
        client = EvernoteClient(token=token['en_token'])
        client.service_host = 'app.yinxiang.com'

    print 'Current server:', client._get_endpoint()
    # BosonNLP
    nlp = BosonNLP(token['boson_nlp_token'])
    note_store = client.get_note_store()

    # Load the previous sync state
    if os.path.exists(data_file('sync_state')):
        last_sync_state = json.load(open(data_file('sync_state'), 'r'))
        last_update_count = last_sync_state['updateCount']
        last_extrect_tag_time = last_sync_state['currentTime']

    # Get the current sync state
    current_sync_state = get_current_sync_state(note_store)

    if current_sync_state.updateCount > last_update_count:
        new_updated_count = current_sync_state.updateCount - last_update_count
        print current_sync_state.__dict__
        new_note_filter = NoteStore.NoteFilter()
        new_note_filter.order = Types.NoteSortOrder.CREATED
        new_notes = note_store.findNotes(new_note_filter, 0, new_updated_count)
        print 'totalNumber:%d\tNoteListNum:%d' % (new_notes.totalNotes, len(new_notes.notes))
    else:
        print 'No new updates...'
        exit(1)

    # Get all of the user's tags
    tags = Tags(note_store=note_store)
    alltags = tags.tags
    print 'Tag cloud:\n'
    print '\t'.join(alltags.keys())

    # Process the new notes
    for note in new_notes.notes:
        # Skip notes created before the last sync time
        if note.created <= last_extrect_tag_time:
            continue
        print '\n' + '*' * 120
        content = note_store.getNoteContent(note.guid)
        print "guid:%s\ntitle:%s\ncreated:%s\nauthor:%s" % (note.guid, note.title, note.created, note.attributes.author)

        print 'author:%s\nsource:%s\nsourceURL:%s\nsourceApplication:%s' % (note.attributes.author, note.attributes.source, note.attributes.sourceURL, note.attributes.sourceApplication)

        if not note.attributes.sourceURL:
            continue
        print "Existing tags: %s" % (",".join(note_store.getNoteTagNames(note.guid)))
        print '-' * 120
        #print "content(%d): created:%s,\n%s" % (note.contentLength, note.created, content)

        # Parse the note XML and extract all the text
        try:
            parser = ET.XMLParser()
            parser.entity['nbsp'] = ''
            parser.entity['ldquo'] = ''
            parser.entity['rdquo'] = ''
            parser.entity['hellip'] = ''
            tree = ET.parse(StringIO(content), parser=parser)
        except Exception as data:
            print 'ElementTree parser error'
            print content
            print 'error data:'
            print data
            exit(1)
        en_note = tree.findall('.')[0]

        content_string = ''.join(en_note.itertext())

        # Write the extracted text to a file
        with codecs.open(note_file(note.guid), 'w+', encoding='utf-8') as f:
            f.write(content_string)
        # Get the article's named entities via BosonNLP
        ner_tag_guid_list = []
        ner_tag_name_list = []
        ner = Ner(content_string).process(nlp)
        entites = ner.collect_type_entity(count=1)
        for entity in entites:
            tag = tags.add(entity)
            ner_tag_guid_list.append(tag.guid)
            ner_tag_name_list.append(tag.name)
        # Get the article's keywords via BosonNLP
        extract_keywords = nlp.extract_keywords(content_string, top_k=20)
        keywords = [item[1].upper() for item in extract_keywords]
        print 'Top 20 keywords from BosonNLP extract_keywords:'
        for keyword in extract_keywords:
            print '%s \t %s' % (keyword[1], keyword[0])
        print '-' * 120
        # Compare to find the GUIDs of tags that already exist
        keywords_tag_guid_list = []
        newKeyWords = []
        for keyword in keywords:
            if tags.exist(keyword):
                existTag = tags.get(keyword)
                keywords_tag_guid_list.append(existTag.guid)
                newKeyWords.append(existTag.name)
        print '\nIntersection of extract_keywords with existing tags:'
        print '\t'.join(newKeyWords)

        # Append the new tags to the note
        new_tag_guid_list = list(set(keywords_tag_guid_list).union(set(ner_tag_guid_list)))
        print 'Tags from extract_keywords + NER:'
        newKeyWords.extend(ner_tag_name_list)
        print '\t'.join(newKeyWords)

        if note.tagGuids:
            note.tagGuids.extend(new_tag_guid_list)
        else:
            note.tagGuids = new_tag_guid_list

        note_store.updateNote(note)
Code example #9
# Assumed setup (not shown in the original snippet): imports and the
# BosonNLP client used below. 'YOUR_API_TOKEN' is a placeholder.
import pprint
from pymongo import MongoClient
from bosonnlp import BosonNLP

nlp = BosonNLP('YOUR_API_TOKEN')


# The opening lines of decodeResult were truncated in the original snippet;
# the function header below is a reconstruction based on how it is used.
def decodeResult(_cursor, _attribution=None):
    results = []
    if _attribution:
        for item in _cursor:
            results.append(item[_attribution])
            # pprint.pprint(item[_attribution])
        return results
    for item in _cursor:
        pprint.pprint(item)
    return

# Connect to mongodb
client = MongoClient("mongodb://localhost:27017/")
db = client.lagou
collection = db.lagouJob



# Find by company name
cursor = collection.find({"companyName":"深圳市贝利美维软件有限公司"})

# Find by Job Title
# cursor = collection.find({"jobTitle":{"$regex": u"区块链"}})
result = decodeResult(cursor, "jobDescription")
resultContent = "".join(result)
nlpResult = nlp.extract_keywords(resultContent, top_k=100)

with open('companyKeywords.txt', 'w') as f:
    for weight, word in nlpResult:
        outString = str(weight) + " " + str(word) + "\n"
        print(weight, word)
        f.write(outString)

Code example #10
# Assumes pandas is imported as pd, nlp is a BosonNLP client, and the
# DataFrame rlaq_u2 has been loaded earlier in the script.
i = 77
id = rlaq_u2.loc[i, 'id']
ajbh = rlaq_u2.loc[i, 'ajbh']
fssj = pd.to_datetime(rlaq_u2.loc[i, 'time'])
txt = rlaq_u2.loc[i, 'jyaq']
txt0 = txt
place = rlaq_u2.loc[i, 'place']
print(txt0)

# Extract times, locations, and people (named entities)
result = nlp.ner(txt)[0]
words = result['word']
entities = result['entity']
Btime = []
Bplace = []
Bpeople = []
for entity in entities:
    if entity[2] == 'time':
        Btime.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'location':
        Bplace.append(''.join(words[entity[0]:entity[1]]))
    if entity[2] == 'person_name':
        Bpeople.append(''.join(words[entity[0]:entity[1]]))
print('Times:', ' | '.join(Btime))
print('Places:', ' | '.join(Bplace))
print('People:', ' | '.join(Bpeople))
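For reference, nlp.ner() returns one dict per document whose 'entity' list holds [start, end, type] spans over the segmented 'word' list; the slicing above depends on that shape. A sketch with invented values:

# Illustrative shape of an ner() result; the tokens and spans are made up.
result = {
    'word': ['昨天', '张三', '在', '北京'],
    'entity': [[0, 1, 'time'], [1, 2, 'person_name'], [3, 4, 'location']],
}
for start, end, etype in result['entity']:
    print(etype, ''.join(result['word'][start:end]))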

# Extract keywords
kw = nlp.extract_keywords(txt, top_k=10)
for w, k in kw:
    print('Keyword:', k, ', weight:', w)
Code example #11
class BosonNlpp:
    def __init__(self):
        self.bonlp = BosonNLP('IKBIoANy.14545.A7GCYBnT9jIB')

    # Sentiment analysis
    def testSentiment(self, s):
        result = self.bonlp.sentiment(s)
        return result
        #print(result)

    # Named entity recognition
    def lexicalAnalysis(self, s):
        result = self.bonlp.ner(s)[0]
        return result

    # Dependency parsing
    def textDependency(self, s):
        result = self.bonlp.depparser(s)
        return result

    # Keyword extraction
    def testKeywords(self, s):
        result = self.bonlp.extract_keywords(s, top_k=10)
        return result

    # News classification
    def textClassify(self, s):
        resultlist = self.bonlp.classify(s)
        classifys = {
            0: '体育',
            1: '教育',
            2: '财经',
            3: '社会',
            4: '娱乐',
            5: '军事',
            6: '国内',
            7: '科技',
            8: '互联网',
            9: '房产',
            10: '国际',
            11: '女人',
            12: '汽车',
            13: '游戏'
        }
        return (classifys[resultlist[0]])

    # Semantic suggestion (related words)
    def lexicalSynonym(self, term):
        result = self.bonlp.suggest(term, top_k=10)
        return result

    # Word segmentation and POS tagging
    def fenci(self, s):
        result = self.bonlp.tag(s)
        return result

    def newssubstract(self, s):
        # Accept either bytes or str input (the original decode failed on str).
        if isinstance(s, bytes):
            s = s.decode('utf-8')
        result = self.bonlp.summary('', s)
        return result
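A minimal usage sketch for the class above; the output shapes follow the BosonNLP documentation, and the input strings are placeholders.

# Hypothetical usage of BosonNlpp; inputs are illustrative.
npp = BosonNlpp()
print(npp.testKeywords('病毒式蔓延的消息'))  # [[weight, word], ...]
print(npp.textClassify('这是一条体育新闻'))  # e.g. '体育'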
Code example #12
class _BosonNLPWrapper(object):
    """
    NLP object using the BosonNLP API Python SDK.
    """

    news_categories = ['physical education', 'education', 'finance', 'society', 'entertainment', 'military',
                       'domestic', 'science and technology', 'the internet', 'real estate', 'international',
                       'women', 'car', 'game']

    def __init__(self, api_token=None):
        if api_token is None:
            raise ValueError("Please provide an API token")

        self.token = api_token
        self.nlp = BosonNLP(self.token)


    def get_sentiment(self, text):
        """
        Performs sentiment analysis on a text passage (works for Chinese text).
        See: http://docs.bosonnlp.com/sentiment.html

        Parameters
        ----------
        text (string): text passage to be analyzed for sentiment


        Returns
        -------
        dictionary with 'positive' and 'negative' as keys with their respective weights as values

        >>> nlp = _BosonNLPWrapper('')
        >>> nlp.get_sentiment('不要打擾我')
        {'positive': 0.3704911989140307, 'negative': 0.6295088010859693}
        >>> nlp.get_sentiment('我很高興跟你見面')
        {'positive': 0.856280735624867, 'negative': 0.14371926437513308}
        """
        pos, neg = self.nlp.sentiment(text)[0]

        return {'positive': pos, 'negative': neg}


    def classify_news(self, text):
        """
        Classifies news text into 14 different categories.
        See: http://docs.bosonnlp.com/classify.html

        Parameters
        ----------
        text (string): text passage to classify into news categories defined in news_categories

        Returns
        -------
        one of the 14 categories in news_categories that the text was classified into
        """
        numbering = range(len(_BosonNLPWrapper.news_categories))
        cats_dict = dict(zip(numbering, _BosonNLPWrapper.news_categories))

        clsfy_num = self.nlp.classify(text)[0]

        return cats_dict[clsfy_num]


    def extract_keywords(self, text, top_k=3):
        """
        Extracts the top k keywords and the weight of each word in the text.
        See: http://docs.bosonnlp.com/keywords.html

        Parameters
        ----------
        text (string): text passage from which to extract keywords
        top_k (integer): number of keywords to return

        Returns
        -------
        list of key-value pairs {word: weight}


        >>> nlp = _BosonNLPWrapper('')
        >>> nlp.extract_keywords('我最愛老虎堂,奶茶香醇,波霸彈Q 好香的黑糖味')
        [{'波霸彈': 0.5980681967308248}, {'黑糖': 0.4699792421671365}, {'香醇': 0.4497614275300947}]
        """
        result = self.nlp.extract_keywords(text, top_k)  # outputs in sorted order of weight

        return [{result[i][1]: result[i][0]} for i in range(len(result))]


    def segment_words_and_tag(self, text):
        """
        Splits up text into segments of "words" and tags them with their respective part of speech.
        See: http://docs.bosonnlp.com/tag.html

        Parameters
        ----------
        text (string): text passage to segment into separate "words" and tags them with parts of speech

        Returns
        -------
        list of key-value pairs {word: part-of-speech-tag}
        """
        result = self.nlp.tag(text)[0]
        words = result['word']
        tags = result['tag']

        return [{words[i]: tags[i]} for i in range(len(words))]


    def get_summary(self, content, title='', pct_limit=0.2):
        """
        Extracts a new digest (summary) of the content.
        See: http://docs.bosonnlp.com/summary.html

        Parameters
        ----------
        text (string): text passage to summarize
        title (string): title of the passage (optional, may provide more accurate results)
        pct_limit (float): max length of the summary in terms of percentage of the original word count

        Returns
        -------
        string containing the summary of the passage
        """
        summary = self.nlp.summary(title, content, pct_limit)

        return summary
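A minimal usage sketch for the wrapper; 'YOUR_API_TOKEN' is a placeholder and the return shapes follow the docstrings above.

# Hypothetical usage of _BosonNLPWrapper.
wrapper = _BosonNLPWrapper('YOUR_API_TOKEN')
print(wrapper.get_sentiment('我很高興跟你見面'))  # {'positive': ..., 'negative': ...}
print(wrapper.extract_keywords('奶茶香醇,波霸彈Q', top_k=2))  # [{word: weight}, ...]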