Example #1
def get_keywords(art_sents, abs_sents):
    """ greedily match summary sentences to article sentences"""
    keywords = {}
    art = ' '.join(art_sents[1:])
    abss = ' '.join(abs_sents)
    #3: RAKE keywords for each doc
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    keywords["rake_art"] = rake.run(art)
    keywords["rake_abs"] = rake.run(abss)

    #4: TF-IDF keywords for processed text
    art_frequencies = {}
    abs_frequencies = {}
    document_count = 1

    keywords["tfidf_freq_art"] = tfidf.get_word_frequencies(art)
    for word in keywords["tfidf_freq_art"]:
        art_frequencies.setdefault(word, 0)
        art_frequencies[word] += 1
    keywords["tfidf_freq_abs"] = tfidf.get_word_frequencies(abss)
    for word in keywords["tfidf_freq_abs"]:
        abs_frequencies.setdefault(word, 0)
        abs_frequencies[word] += 1

    sortby = lambda x: x[1]["score"]
    for word in keywords["tfidf_freq_art"].items():
        word_frequency = word[1]["frequency"]
        docs_with_word = art_frequencies[word[0]]
        word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                           docs_with_word)
    for word in keywords["tfidf_freq_abs"].items():
        word_frequency = word[1]["frequency"]
        docs_with_word = abs_frequencies[word[0]]
        word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                           docs_with_word)

    keywords["tfidf_art"] = sorted(keywords["tfidf_freq_art"].items(),
                                   key=sortby,
                                   reverse=True)
    keywords["tfidf_abs"] = sorted(keywords["tfidf_freq_abs"].items(),
                                   key=sortby,
                                   reverse=True)

    #5. TextRank
    keywords['textrank_art'] = textrank.extractKeyphrases(art)
    keywords['textrank_abs'] = textrank.extractKeyphrases(abss)

    return (keywords['rake_art'], keywords['rake_abs'],
            keywords['tfidf_art'], keywords['tfidf_abs'],
            keywords['textrank_art'], keywords['textrank_abs'])
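get_keywords above leans on a tfidf helper module that is not shown. A minimal sketch of the interface it assumes is below; the function names come from the calls above, but the implementation and the classic TF-IDF formula are assumptions, not the original module.

# Hypothetical stand-in for the tfidf module referenced in get_keywords.
import math
import re


def get_word_frequencies(text):
    """Return {word: {"frequency": count}} for every word in text."""
    frequencies = {}
    for word in re.findall(r"[a-z']+", text.lower()):
        entry = frequencies.setdefault(word, {"frequency": 0})
        entry["frequency"] += 1
    return frequencies


def calculate(word_frequency, document_count, docs_with_word):
    """Classic TF-IDF: term frequency times log of inverse document frequency.
    Note that with document_count == 1, as in get_keywords, this is always 0."""
    return word_frequency * math.log(document_count / float(docs_with_word))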
Example #2
def textRank():
    """
    Calculate the TextRank of today's pastebin.com pastes stored in MongoDB and update the keywords in ELK.
    """
    loadConfig()

    client = MongoClient(mongo_host, mongo_port)
    elk = Elastic(elk_host, elk_port)
    db = client['pastebin']
    pastebin = db.pastebin

    now = datetime.datetime.now()
    date = now.strftime("%Y-%m-%d")
    startdate = datetime.datetime.strptime(date, '%Y-%m-%d')
    print(startdate)

    cursor = pastebin.find({"@timestamp": {
        "$gt": startdate
    }})
    print("article count: %d" % cursor.count())
    if cursor.count() == 0:
        print('Today %s has no data to analyze.' % (date))
        sys.exit(0)
    article = cursor[:]

    for row in article:
        try:
            text = row['text']
            keyphrases = extractKeyphrases(text, keyWordNum)
            jbody = {"doc": {"tfidf": keyphrases}}
            elk.update2elk(elk_index, elk_type, row['pid'], jbody)
        except:
            print("pastebin textRank error.")
            continue
    print("\nComplete!")
Example #3
def nouse():
    #3: RAKE keywords for each page
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1

    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                               docs_with_word)

        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(),
                                       key=sortby,
                                       reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(),
                                          key=lambda x: x[1],
                                          reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0])

    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)
def textrank_kw(data, field='description'):
    print 'Extracting textrank keywords'

    kw_field = '%s_textrank_kw' % field

    for i, d in enumerate(data):
        if i and i % 1000 == 0:
            print '\t', i

        keywords = extractKeyphrases(d[field])
        keywords = map(string.upper, keywords)
        d[kw_field] = list(set(keywords))

    return data
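This example is Python 2 code (print statements, and string.upper, which the string module no longer provides in Python 3). A rough Python 3 equivalent, assuming extractKeyphrases is importable as above:

def textrank_kw_py3(data, field='description'):
    """Python 3 sketch of textrank_kw above."""
    print('Extracting textrank keywords')
    kw_field = '%s_textrank_kw' % field
    for i, d in enumerate(data):
        if i and i % 1000 == 0:
            print('\t', i)
        keywords = extractKeyphrases(d[field])
        # str.upper replaces the removed string.upper; a set comprehension deduplicates.
        d[kw_field] = list({kw.upper() for kw in keywords})
    return data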
Example #5
def textrank_kw(data, field='description'):
    print 'Extracting textrank keywords'

    kw_field = '%s_textrank_kw' % field

    for i, d in enumerate(data):
        if i and i % 1000 == 0:
            print '\t', i

        keywords = extractKeyphrases(d[field])
        keywords = map(string.upper, keywords)
        d[kw_field] = list(set(keywords))

    return data
Example #6
def textrank_analysis(peak_tweets, orig_tag=''):
    """
    Takes a list of lists of tweets, with each sublist being tweets within a peak
    Concatenate & clean tweets.
    Returns a list of #1 keyword for each sublist determined by TextRank.
    """
    textrank_kw = []
    for peak in peak_tweets:
        text = '.'.join(peak)
        text = clean_tweet(text, orig_tag)
        peak_textrank = textrank.extractKeyphrases(text)
        textrank_keys = sorted(peak_textrank.items(),
                               key=lambda x: x[1],
                               reverse=True)[0][0]
        textrank_kw.append(textrank_keys)
    return textrank_kw
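A minimal usage sketch for textrank_analysis; the tweets and hashtag are invented for illustration, and clean_tweet and textrank are assumed to be imported as in the function above.

peak_tweets = [
    ["Train delays on the east line again", "The east line is delayed 40 minutes"],
    ["New hawker centre opens downtown", "Long queues at the new hawker centre"],
]
top_keywords = textrank_analysis(peak_tweets, orig_tag='#sgupdates')
# top_keywords holds one top-ranked TextRank keyword per peak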
def parseHtml(iters, category, match, i):
    matchlink = ''
    title = ''
    date = ''
    content = ''
    keyword = ''
    try:
        matchlink = match.find(name='a').get('href')
        link = 'https://www.allsingaporestuff.com' + matchlink
    except:
        print(BackColors.WARNING + 'The ' + str(iters) + 'th Page ' + str(i) +
              'th link Error' + BackColors.ENDC)
        link = ''
    if matchlink:
        contentHtml = requests.get(url=link,
                                   headers=RequestHeader.browserHeader)
        contentSoup = BeautifulSoup(contentHtml.text, 'lxml')
        date = contentSoup.find_all(name='span',
                                    attrs={'property':
                                           'dc:date dc:created'})[0]['content']

        title = contentSoup.find_all(
            attrs={'property': 'og:title'})[0]['content']
        content = contentSoup.find_all(
            attrs={'name': 'twitter:description'})[0]['content']

        keywords = extractSentences(content)
        keywords = extractKeyphrases(keywords)

        lettersOnly = re.sub("[^a-zA-Z]", " ", " ".join(keywords))
        lowerCase = lettersOnly.lower()
        words = lowerCase.split()
        cachedstopwords = open(Paths.textPath + 'stopwords.txt').read()
        stopwords = cachedstopwords.split('\n')
        words = [w for w in words if w not in stopwords]
        keyword = " ".join(words)

    d = AllSingaporeStuffDateParse(date)

    data = (iters, title, link, category, keyword, d, content)

    if matchlink and keyword and title and content:
        insertData(data)
Example #8
def parseHtml(iters, category, match, i):
    matchlink = ''
    title = ''
    date = ''
    content = ''
    keyword = ''
    lettersOnly = ''  # initialized here so the data tuple below is defined even when no link is found
    try:
        matchlink = match.find(name='h3').find(name='a').get('href')
        link = matchlink
    except:
        print(BackColors.WARNING + 'The ' + str(iters) + 'th Page ' + str(i) +
              'th link Error' + BackColors.ENDC)
        link = ''
    try:
        date = match.find(name='div', attrs={'class', 'post-date'}).get_text()
    except:
        date = ""
    d = MotherShipDateParse(date)

    if matchlink:
        contentHtml = requests.get(url=link,
                                   headers=RequestHeader.browserHeader)
        contentSoup = BeautifulSoup(contentHtml.text, 'lxml')
        title = contentSoup.find(name='h1',
                                 attrs={'class', 'content-title'}).get_text()
        content = contentSoup.find(name='div',
                                   attrs={'class', 'post-content'}).get_text()

        keywords = extractSentences(content)
        keywords = extractKeyphrases(keywords)

        lettersOnly = re.sub("[^a-zA-Z]", " ", " ".join(keywords))
        lowerCase = lettersOnly.lower()
        words = lowerCase.split()
        cachedstopwords = open(Paths.textPath + 'stopwords.txt').read()
        stopwords = cachedstopwords.split('\n')
        words = [w for w in words if w not in stopwords]
        keyword = " ".join(words[:25])

    data = (iters, title, link, category, keyword, d, lettersOnly)

    if matchlink and keyword and title and content:
        insertData(data)
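Note that attrs={'class', 'post-date'} and the two similar lookups above are set literals rather than dicts. If the intent is to match on the CSS class, the usual Beautiful Soup spellings are a dict or the class_ keyword, roughly:

# Sketch of the likely intended lookups, using bs4's documented dict / class_ forms.
date = match.find('div', class_='post-date').get_text()
title = contentSoup.find('h1', class_='content-title').get_text()
content = contentSoup.find('div', class_='post-content').get_text()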
def watching_stories(domain_list):
    """
    watching stories of competitors
    :param domain_list: targeted competitors domain names
    :return:
    """
    fetch = fetch_time_line()
    db_category_list = __category_service.find_all_categories()
    db_interest_list = __interest_service.find_all_interests()
    for domain in domain_list:
        r = requests.get("https://api.newswhip.com/v1/publisher/" + domain + "/1?key="+newswhip_key)
        response = json.loads(r.text)
        logging.info("Domain: " + domain + " & No of Articles: " + str(len(response['articles'])))
        for item in response['articles']:
            try:
                article_info = domparser.element_picker(item['link'].encode('utf-8'))
                if article_info['title'] is not None or article_info['feature_image'] is not None or article_info['url'] is not None:
                    article = {'title': '', 'url': '', 'description': '', 'keywords': '', 'feature_image': '','New_score': '',
                            'max_new_score': '', 'fb_like': '', 'tweet_count': '', 'publisher': '', "uuid": '', 'published': '',
                               'category': [], 'interest': [], 'fetch': '', 'created_keys':[]}
                    if item['headline'] is None:
                        article['title'] = article_info['title']
                    else:
                        article['title'] = item['headline'].encode('utf-8')

                    if item['link'] is None:
                        article['url'] = article_info['url']
                    else:
                        article['url'] = item['link'].encode('utf-8')

                    if item['excerpt'] is None:
                        article['description'] = article_info['description']
                    else:
                        article['description'] = item['excerpt']

                    if item['keywords'] is None:
                        article['keywords'] = article_info['keywords']
                    else:
                        article['keywords'] = (item['keywords']).split(',')

                    if item['image_link'] is None:
                        article['feature_image'] = article_info['feature_image']
                    else:
                        article['feature_image'] = item['image_link']
                    if 'nw_score' in item:
                        article['New_score'] = item['nw_score']
                    else:
                        article['New_score'] = 0
                    if 'max_nw_score' in item:
                        article['max_new_score'] = item['max_nw_score']
                    else:
                        article['max_new_score'] = 0
                    if 'total_engagement_count' in item['fb_data']:
                        article['fb_like'] = item['fb_data']['total_engagement_count']
                    else:
                        article['fb_like'] = 0
                    if 'tw_count' in item['tw_data']:
                        article['tweet_count'] = item['tw_data']['tw_count']
                    else:
                        article['tweet_count'] = 0
                    if 'publisher' in item['source']:
                        article['publisher'] = item['source']['publisher']
                    else:
                        article['publisher'] = "None"
                    if 'uuid' in item:
                        article['uuid'] = item['uuid']
                    else:
                        article['uuid'] = 'None'
                    if 'publication_timestamp' in item:
                        article['published'] = time.strftime('%Y-%m-%d %H:%M', time.localtime(item['publication_timestamp']/1000.0))
                    else:
                        article['published'] = "None"
                    article['fetch'] = current_epoch_time(datetime.now())

                    dummy_category = []
                    for i in article_info['category']:
                        split_list = i.split(',')
                        for itr in split_list:
                            if itr not in dummy_category:
                                dummy_category.append(itr.lower())

                    article_info['category'] = dummy_category
                    if not any(category['category'] in article_info['category'] for category in db_category_list):
                        for category_item in article_info['category']:
                            for interest in db_interest_list:
                                if category_item == interest['interest']:
                                    if category_item not in article['interest']:
                                        article['interest'].append(category_item)

                        if len(article['interest']) <= 0:
                            article['category'] = article_info['category']
                        else:
                            article['category'] = []
                            for int_item in article['interest']:
                                current_interest = filter(lambda member:int_item == member['interest'], db_interest_list)
                                if len(current_interest) == 1:
                                    current_category = filter(lambda member: current_interest[0]['category_id'] == member['_id'], db_category_list)
                                if len(current_category) == 1:
                                    article['category'].append(current_category[0]['category'])

                    else:
                        if article['keywords'] is not None:
                            (article['interest'], return_category_ids) = checking_interest(article['keywords'])
                        article['category'] = article_info['category']

                    key_phrases_list = []
                    raw_key_phrases_list = []
                    interest_category_id = []
                    if article_info['keywords']:
                        keywords_key_phrases = (''.join(map(str, ((article_info['keywords'][0]).decode('ascii', 'ignore')).lower()))).split(",")
                        key_phrases_list += keywords_key_phrases
                        raw_key_phrases_list += keywords_key_phrases
                    if article_info['title']:
                        title_key_phrases = extractKeyphrases(article_info['title'].decode('ascii', 'ignore'))
                        key_phrases_list += list(title_key_phrases)
                        raw_key_phrases_list.append(str(article_info['title'].decode('ascii', 'ignore')))
                    if article_info['description']:
                        description_key_phrases = extractKeyphrases(article_info['description'].decode('ascii', 'ignore'))
                        key_phrases_list += list(description_key_phrases)
                        raw_key_phrases_list.append(str(article_info['description'].decode('ascii', 'ignore')))
                    d = Counter(key_phrases_list)
                    keys_to_remove = ['', ' ', '%', 'an', 'a', ',', 'ii', 'r', 'so', 'is', 'in', 'the', 'nbt', 'us', 'them', 's', '|', 'eisamay', 'navbharat', '-navbharat', 'navbharat times', 'samay', 'india']
                    refactor_key_list = []
                    for key in list(d.keys()):
                        if (key.strip()).lower() not in keys_to_remove and (key.strip()).lower() not in refactor_key_list:
                            refactor_key_list.append((key.strip()).lower())
                    article['created_keys'] = refactor_key_list
                    if article['created_keys'] is not None:
                        (created_interest, interest_category_id) = checking_interest(raw_key_phrases_list)
                        if created_interest is not None:
                            article['interest'] += created_interest
                        if interest_category_id is not None:
                            cat_dict = Counter(interest_category_id)
                            top_order_category = ''
                            top = 0
                            for index, cat_item in enumerate(cat_dict.keys()):
                                if cat_dict[cat_dict.keys()[index]] >= top:
                                    top_order_category = cat_dict.keys()[index]
                                    top = cat_dict[cat_dict.keys()[index]]
                            if top_order_category:
                                supposed_category = __category_service.find_category(top_order_category)
                                article['category'].append(supposed_category['category'])

                    if article['interest']:
                        article['status'] = True
                    else:
                        article['status'] = False
                    __story_service.save_story(article)
                    __fetch_service.save_fetch(fetch)

            except Exception as ex:
                logging.info("Runtime Error: " + ex)
def textrank_parallel(text):
    keywords = extractKeyphrases(text)
    keywords = map(string.upper, keywords)
    return list(set(keywords))
def execute(cleanse_method, pages):
    """Execute RAKE and TF-IDF algorithms on each page and output top scoring phrases"""

    start_time = time.time()

    #1: Initialize a URL reader with local caching to be kind to the internet
    print("=== 1. Initialize")
    reader = contentloader.CacheableReader(CACHE_FOLDER, cleanse_method)
    print("Initialized: %d" % (time.time() - start_time))

    #2: Collect raw text for pages
    print("=== 2. Collect Raw Text")
    processed_pages = []
    for page in pages:
        page_text = reader.get_site_text(page)
        processed_pages.append({"url": page, "text": page_text})
    print("Collected: %d" % (time.time() - start_time))

    #3: RAKE keywords for each page
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1

    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count, docs_with_word)

        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(), key=sortby, reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(), key=lambda x: x[1], reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0])

    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)
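A usage sketch for execute; the URLs and the cleanse-method name are placeholders, and RAKE_STOPLIST, CACHE_FOLDER, and the imported modules are assumed to exist at module level as the function implies.

PAGES = [
    "https://en.wikipedia.org/wiki/Automatic_summarization",
    "https://en.wikipedia.org/wiki/Tf%E2%80%93idf",
]
execute("default", PAGES)  # "default" is a hypothetical cleanse_method accepted by CacheableReader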
Example #12
def textrank_parallel(text):
    keywords = extractKeyphrases(text)
    keywords = map(string.upper, keywords)
    return list(set(keywords))
def watching_stories(domain_list):
    """
    watching stories of competitors
    :param domain_list: targeted competitors domain names
    :return:
    """
    fetch = fetch_time_line()
    db_category_list = __category_service.find_all_categories()
    db_interest_list = __interest_service.find_all_interests()
    for domain in domain_list:
        r = requests.get("https://api.newswhip.com/v1/publisher/" + domain +
                         "/1?key=" + newswhip_key)
        response = json.loads(r.text)
        logging.info("Domain: " + domain + " & No of Articles: " +
                     str(len(response['articles'])))
        for item in response['articles']:
            try:
                article_info = domparser.element_picker(
                    item['link'].encode('utf-8'))
                if article_info['title'] is not None or article_info[
                        'feature_image'] is not None or article_info[
                            'url'] is not None:
                    article = {
                        'title': '',
                        'url': '',
                        'description': '',
                        'keywords': '',
                        'feature_image': '',
                        'New_score': '',
                        'max_new_score': '',
                        'fb_like': '',
                        'tweet_count': '',
                        'publisher': '',
                        "uuid": '',
                        'published': '',
                        'category': [],
                        'interest': [],
                        'fetch': '',
                        'created_keys': []
                    }
                    if item['headline'] is None:
                        article['title'] = article_info['title']
                    else:
                        article['title'] = item['headline'].encode('utf-8')

                    if item['link'] is None:
                        article['url'] = article_info['url']
                    else:
                        article['url'] = item['link'].encode('utf-8')

                    if item['excerpt'] is None:
                        article['description'] = article_info['description']
                    else:
                        article['description'] = item['excerpt']

                    if item['keywords'] is None:
                        article['keywords'] = article_info['keywords']
                    else:
                        article['keywords'] = (item['keywords']).split(',')

                    if item['image_link'] is None:
                        article['feature_image'] = article_info[
                            'feature_image']
                    else:
                        article['feature_image'] = item['image_link']
                    if 'nw_score' in item:
                        article['New_score'] = item['nw_score']
                    else:
                        article['New_score'] = 0
                    if 'max_nw_score' in item:
                        article['max_new_score'] = item['max_nw_score']
                    else:
                        article['max_new_score'] = 0
                    if 'total_engagement_count' in item['fb_data']:
                        article['fb_like'] = item['fb_data'][
                            'total_engagement_count']
                    else:
                        article['fb_like'] = 0
                    if 'tw_count' in item['tw_data']:
                        article['tweet_count'] = item['tw_data']['tw_count']
                    else:
                        article['tweet_count'] = 0
                    if 'publisher' in item['source']:
                        article['publisher'] = item['source']['publisher']
                    else:
                        article['publisher'] = "None"
                    if 'uuid' in item:
                        article['uuid'] = item['uuid']
                    else:
                        article['uuid'] = 'None'
                    if 'publication_timestamp' in item:
                        article['published'] = time.strftime(
                            '%Y-%m-%d %H:%M',
                            time.localtime(item['publication_timestamp'] /
                                           1000.0))
                    else:
                        article['published'] = "None"
                    article['fetch'] = current_epoch_time(datetime.now())

                    dummy_category = []
                    for i in article_info['category']:
                        split_list = i.split(',')
                        for itr in split_list:
                            if itr not in dummy_category:
                                dummy_category.append(itr.lower())

                    article_info['category'] = dummy_category
                    if not any(category['category'] in article_info['category']
                               for category in db_category_list):
                        for category_item in article_info['category']:
                            for interest in db_interest_list:
                                if category_item == interest['interest']:
                                    if category_item not in article[
                                            'interest']:
                                        article['interest'].append(
                                            category_item)

                        if len(article['interest']) <= 0:
                            article['category'] = article_info['category']
                        else:
                            article['category'] = []
                            for int_item in article['interest']:
                                current_interest = filter(
                                    lambda member: int_item == member[
                                        'interest'], db_interest_list)
                                if len(current_interest) == 1:
                                    current_category = filter(
                                        lambda member: current_interest[0][
                                            'category_id'] == member['_id'],
                                        db_category_list)
                                if len(current_category) == 1:
                                    article['category'].append(
                                        current_category[0]['category'])

                    else:
                        if article['keywords'] is not None:
                            (article['interest'],
                             return_category_ids) = checking_interest(
                                 article['keywords'])
                        article['category'] = article_info['category']

                    key_phrases_list = []
                    raw_key_phrases_list = []
                    interest_category_id = []
                    if article_info['keywords']:
                        keywords_key_phrases = (''.join(
                            map(str, ((article_info['keywords'][0]).decode(
                                'ascii', 'ignore')).lower()))).split(",")
                        key_phrases_list += keywords_key_phrases
                        raw_key_phrases_list += keywords_key_phrases
                    if article_info['title']:
                        title_key_phrases = extractKeyphrases(
                            article_info['title'].decode('ascii', 'ignore'))
                        key_phrases_list += list(title_key_phrases)
                        raw_key_phrases_list.append(
                            str(article_info['title'].decode(
                                'ascii', 'ignore')))
                    if article_info['description']:
                        description_key_phrases = extractKeyphrases(
                            article_info['description'].decode(
                                'ascii', 'ignore'))
                        key_phrases_list += list(description_key_phrases)
                        raw_key_phrases_list.append(
                            str(article_info['description'].decode(
                                'ascii', 'ignore')))
                    d = Counter(key_phrases_list)
                    keys_to_remove = [
                        '', ' ', '%', 'an', 'a', ',', 'ii', 'r', 'so', 'is',
                        'in', 'the', 'nbt', 'us', 'them', 's', '|', 'eisamay',
                        'navbharat', '-navbharat', 'navbharat times', 'samay',
                        'india'
                    ]
                    refactor_key_list = []
                    for key in list(d.keys()):
                        if (key.strip()).lower() not in keys_to_remove and (
                                key.strip()).lower() not in refactor_key_list:
                            refactor_key_list.append((key.strip()).lower())
                    article['created_keys'] = refactor_key_list
                    if article['created_keys'] is not None:
                        (created_interest, interest_category_id
                         ) = checking_interest(raw_key_phrases_list)
                        if created_interest is not None:
                            article['interest'] += created_interest
                        if interest_category_id is not None:
                            cat_dict = Counter(interest_category_id)
                            top_order_category = ''
                            top = 0
                            for index, cat_item in enumerate(cat_dict.keys()):
                                if cat_dict[cat_dict.keys()[index]] >= top:
                                    top_order_category = cat_dict.keys()[index]
                                    top = cat_dict[cat_dict.keys()[index]]
                            if top_order_category:
                                supposed_category = __category_service.find_category(
                                    top_order_category)
                                article['category'].append(
                                    supposed_category['category'])

                    if article['interest']:
                        article['status'] = True
                    else:
                        article['status'] = False
                    __story_service.save_story(article)
                    __fetch_service.save_fetch(fetch)

            except Exception as ex:
                logging.info("Runtime Error: " + ex)
Example #14
    # Build the sentence graph and run PageRank to get TextRank scores
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)),
                  reverse=True)


if __name__ == '__main__':
    document = """To Sherlock Holmes she is always the woman. I have
    seldom heard him mention her under any other name. In his eyes she
    eclipses and predominates the whole of her sex. It was not that he
    felt any emotion akin to love for Irene Adler. All emotions, and that
    one particularly, were abhorrent to his cold, precise but admirably
    balanced mind. He was, I take it, the most perfect reasoning and
    observing machine that the world has seen, but as a lover he would
    have placed himself in a false position. He never spoke of the softer
    passions, save with a gibe and a sneer. They were admirable things for
    the observer-excellent for drawing the veil from men’s motives and
    actions. But for the trained reasoner to admit such intrusions into
    his own delicate and finely adjusted temperament was to introduce a
    distracting factor which might throw a doubt upon all his mental
    results. Grit in a sensitive instrument, or a crack in one of his own
    high-power lenses, would not be more disturbing than a strong emotion
    in a nature such as his. And yet there was but one woman to him, and
    that woman was the late Irene Adler, of dubious and questionable
    memory.
    """
    TextRank(document)

    print(textrank.extractKeyphrases(document))
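The snippet above starts after sentences and similarity_graph already exist. One common way to build them for this PageRank step is TF-IDF sentence vectors with a sparse similarity product; this is an assumption shown with scikit-learn and NLTK, not necessarily what the original TextRank() does.

# Hypothetical construction of `sentences` and `similarity_graph`
# feeding the nx.from_scipy_sparse_matrix / nx.pagerank call above.
import nltk  # requires the punkt tokenizer data
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = nltk.sent_tokenize(document)
tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
similarity_graph = tfidf_matrix * tfidf_matrix.T  # sparse sentence-to-sentence similarity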
Example #15
def execute(cleanse_method, pages):
    """Execute RAKE and TF-IDF algorithms on each page and output top scoring phrases"""

    start_time = time.time()

    #1: Initialize a URL reader with local caching to be kind to the internet
    print("=== 1. Initialize")
    reader = contentloader.CacheableReader(CACHE_FOLDER, cleanse_method)
    print("Initialized: %d" % (time.time() - start_time))

    #2: Collect raw text for pages
    print("=== 2. Collect Raw Text")
    processed_pages = []
    for page in pages:
        page_text = reader.get_site_text(page)
        processed_pages.append({"url": page, "text": page_text})
    print("Collected: %d" % (time.time() - start_time))

    #3: RAKE keywords for each page
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=5)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1

    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                               docs_with_word)

        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(),
                                       key=sortby,
                                       reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(),
                                          key=lambda x: x[1],
                                          reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0])

    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)
Example #16
def execute(pages):
    """Execute RAKE and TF-IDF algorithms on each page and output top scoring phrases"""

    start_time = time.time()

    #2: Collect raw text for pages
    print("=== 2. Collect Raw Text from file")
    text = ""
    f = open(pages[0], "r")
    for line in f:
        #line = line.strip("\r")
        #line = line.strip("\n")
        text += line.lower()
    processed_pages = []
    for page in pages:
        page_text = text
        processed_pages.append({"url": pages[0], "text": page_text})
    print("Collected: %d" % (time.time() - start_time))

    #3: RAKE keywords for each page
    print("=== 3. RAKE")
    rake = RAKE.Rake(RAKE_STOPLIST, min_char_length=2, max_words_length=1)
    for page in processed_pages:
        page["rake_results"] = rake.run(page["text"])
    print("RAKE: %d" % (time.time() - start_time))

    #4: TF-IDF keywords for processed text
    print("=== 4. TF-IDF")
    document_frequencies = {}
    document_count = len(processed_pages)
    for page in processed_pages:
        page["tfidf_frequencies"] = tfidf.get_word_frequencies(page["text"])
        for word in page["tfidf_frequencies"]:
            document_frequencies.setdefault(word, 0)
            document_frequencies[word] += 1

    sortby = lambda x: x[1]["score"]
    for page in processed_pages:
        for word in page["tfidf_frequencies"].items():
            word_frequency = word[1]["frequency"]
            docs_with_word = document_frequencies[word[0]]
            word[1]["score"] = tfidf.calculate(word_frequency, document_count,
                                               docs_with_word)

        page["tfidf_results"] = sorted(page["tfidf_frequencies"].items(),
                                       key=sortby,
                                       reverse=True)
    print("TF-IDF: %d" % (time.time() - start_time))

    #5. TextRank
    print("=== 5. TextRank")
    for page in processed_pages:
        textrank_results = textrank.extractKeyphrases(page["text"])
        page["textrank_results"] = sorted(textrank_results.items(),
                                          key=lambda x: x[1],
                                          reverse=True)
    print("TextRank: %d" % (time.time() - start_time))

    #6. Results
    print("=== 6. Results")
    for page in processed_pages:
        print("-------------------------")
        print("URL: %s" % page["url"])
        print("RAKE:")
        for result in page["rake_results"][:5]:
            print(" * %s" % result[0], result[1])
        print("TF-IDF:")
        for result in page["tfidf_results"][:5]:
            print(" * %s" % result[0], result[1])
        print("TextRank:")
        for result in page["textrank_results"][:5]:
            print(" * %s" % result[0], result[1])

    end_time = time.time() - start_time
    print('Done. Elapsed: %d' % end_time)