Example #1
0
def cosine_similartity_bow(text_list):
    """
    Compute the cosine similarity between the first text and every other
    text, using bag-of-words vectors over a shared vocabulary.

    :type text_list: list
    :param text_list: list of all text
    :returns: list of cosine similarities of text_list[0] vs. each other text
    """
    # tokenize every document (with lemmatization)
    doc_specific_tokens = [
        NlpPreprocessing(text).word_lem_tokenize() for text in text_list
    ]

    # flatten the per-document token lists into one list
    all_tokens = [token for tokens in doc_specific_tokens for token in tokens]

    line_loc()

    # the vocabulary: every token exactly once
    unique_tokens = list(set(all_tokens))

    line_loc()

    # one bag-of-words vector per document over the shared vocabulary
    vectors = [
        compute_bow_vectors(unique_tokens, tokens)
        for tokens in doc_specific_tokens
    ]

    # cosine of the first document against each of the remaining ones
    return [
        compute_cosine(vectors[0], vectors[i])
        for i in range(1, len(doc_specific_tokens))
    ]
Example #2
0
    def eir_intersection_reduction(self, text, other_list_of_keywords):
        """
        Recognize named entities in *text* with spaCy and intersect them with
        the meta keywords, returning the most relevant keywords as a set.

        :type text: string
        :param text: the text from which the keywords must be extracted

        :type other_list_of_keywords: list
        :param other_list_of_keywords: other keywords for the intersection

        :returns: set of keywords common to both sources (shorter form kept)
        """
        # load the English model
        # install 'en' model first (python3 -m spacy download en)
        nlp = spacy.load('en')

        # run the pipeline and take the recognized entities as candidates
        doc = nlp(text)

        keywords = doc.ents

        #---------------------------------------
        line_loc()
        highlight_fore(str(keywords))
        highlight_fore(other_list_of_keywords)
        line_loc()
        #---------------------------------------

        final_keywords = []

        # whenever an entity and a meta keyword contain one another,
        # keep whichever string is shorter
        for word in keywords:

            entity_text = str(word)

            for item in other_list_of_keywords:

                if (entity_text in item) or (item in entity_text):

                    if len(entity_text) <= len(item):
                        final_keywords.append(entity_text)
                    else:
                        final_keywords.append(item)

        #---------------------------------------
        line_loc()
        print(final_keywords)
        line_loc()
        #---------------------------------------
        return set(final_keywords)
Example #3
0
    def __init__(self, URL):
        """
        Initialize the crawler for a URL and eagerly fetch the article.

        Extraction strategy: try News-Please (content) + Lassie (meta
        keywords) first; on any failure — including empty content — fall
        back to Goose. If both extractors fail, all attributes are set to
        safe empty defaults so the instance is never left partially
        initialized.

        :type URL: string
        :param URL: The URL of the Website
        """
        self.URL = URL

        try:

            # initialize NewsPlease
            news_please_article = NewsPlease.from_url(self.URL)

            # set title
            self.title = news_please_article.title

            # set content
            self.content = news_please_article.text

            # empty content means extraction failed; migrate the crawler
            # from News-Please and Lassie to Goose
            if (len(self.content) == 0):
                raise Exception

            # set meta keywords (Lassie handles keyword extraction)
            self.meta_keywords = lassie.fetch(self.URL)["keywords"]

            # set meta description
            self.meta_description = news_please_article.description

            # top image url
            self.top_img_url = news_please_article.image_url

        except Exception as exception:

            highlight_back(
                "[Crawler] Crawler migrated from News-Please and Lassie to Goose due to an exception: {}"
                .format(exception), 'G')
            line_loc()

            try:

                # initialize Goose
                goose = Goose()

                # extract the article with Goose
                goose_article = goose.extract(self.URL)

                # assign title
                self.title = goose_article.title

                # assign content
                self.content = goose_article.cleaned_text

                # empty content again -> both extractors have failed
                if (len(self.content) == 0):
                    raise Exception

                # meta keywords come back as one comma-separated string;
                # split it to form a list
                self.meta_keywords = goose_article.meta_keywords.split(',')

                # assign meta description
                self.meta_description = goose_article.meta_description

                # Goose does not provide a top image here
                self.top_img_url = ''

            except Exception as exception:

                highlight_back(
                    "[Crawler] An exception has occured in Goose: {}".format(
                        exception), 'R')
                line_loc()

                # BUGFIX: both extractors failed. The original left the
                # instance partially initialized here, so any later access
                # to .title/.content/... raised AttributeError. Fall back
                # to safe empty defaults instead.
                self.title = ''
                self.content = ''
                self.meta_keywords = []
                self.meta_description = ''
                self.top_img_url = ''
Example #4
0
    def get_data_wo_user_help(self, user_link_data, no_of_keywords=6):
        """
        This method returns all data without the user giving the keywords.

        :type user_link_data: dict
        :param user_link_data: the data retrieved from the users link
            (title, description, content, meta keywords, top image, URL)

        :type no_of_keywords: int
        :param no_of_keywords: the maximum number of keywords for the
            AND-query against the news API

        :returns: dict with "status" == "success" and the aggregated news
            data, or "status" == "fail" plus keyword "suggestions" when no
            usable keywords could be derived automatically
        """
        # if true let the user enter the keywords
        ask_keywords_from_user = False

        # get title from the users link
        user_link_title = user_link_data['title']

        # get description from the users link
        user_link_description = user_link_data['description']

        # get the content from users link
        user_link_content = user_link_data['content']

        # get the meta keywords
        user_link_meta_keywords = user_link_data['meta_keywords']

        # link to the top image
        user_link_top_img_url = user_link_data['top_img_URL']

        # url of the users link
        user_link_URL = user_link_data['users_link']

        line_loc()
        print(user_link_title)
        print(user_link_description)
        print(user_link_content)
        print(user_link_meta_keywords)
        print(user_link_top_img_url)
        line_loc()

        # KEYWORD MANAGEMENT STARTS HERE--------------------

        # initialize empty keywords
        keywords = []

        # initialize empty entity keywords
        eir_keywords = []

        # KeyWordCheck contains methods for processing keywords
        keyword_manager = KeyWordCheck()

        # remove duplicates and sort the user meta keywords by length
        intermediate_keywords = sorted(list(set(user_link_meta_keywords)),
                                       key=len)

        if ((len(intermediate_keywords) <= 1)):

            # recognize keywords from the title and the description,
            # merge them and remove duplicates
            no_meta_eir_keywords = list(
                set(
                    keyword_manager.eir_keywords(user_link_title) +
                    keyword_manager.eir_keywords(user_link_description)))

            print(no_meta_eir_keywords)

            # sort and lower case keywords
            no_meta_eir_keywords = keyword_manager.keyword_formatter(
                no_meta_eir_keywords)

            print(no_meta_eir_keywords)

            # reduce similar keywords
            no_meta_eir_keywords = keyword_manager.keyword_reducer(
                no_meta_eir_keywords)

            print(no_meta_eir_keywords)

            # remove irrelevant keywords
            no_meta_eir_keywords = keyword_manager.remove_irrelevant_keywords(
                no_meta_eir_keywords)

            print(no_meta_eir_keywords)

            print("HERE 1")
            line_loc()

            # accept between 2 and no_of_keywords keywords
            if (len(no_meta_eir_keywords) > 1
                    and len(no_meta_eir_keywords) <= no_of_keywords):

                keywords = no_meta_eir_keywords
                print("HERE 2")
                line_loc()

            elif (len(no_meta_eir_keywords) == no_of_keywords + 1):

                # pop the last (longest) keyword to fit the limit
                no_meta_eir_keywords.pop(-1)
                keywords = no_meta_eir_keywords
                print("HERE 3")
                line_loc()

            elif (len(no_meta_eir_keywords) == no_of_keywords + 2):

                no_meta_eir_keywords.pop(-1)
                no_meta_eir_keywords.pop(-1)

                keywords = no_meta_eir_keywords
                print("HERE 4")
                line_loc()

            else:
                # skip the current method and ask the user for keywords
                ask_keywords_from_user = True

                highlight_back("Asking the user for keywords", 'R')

        # between 2 and no_of_keywords meta keywords: use them directly
        elif (len(intermediate_keywords) >= 2
              and len(intermediate_keywords) <= no_of_keywords):

            keywords = intermediate_keywords

            print("HERE 5")
            line_loc()

        elif (len(intermediate_keywords) == no_of_keywords + 1):

            intermediate_keywords.pop(-1)
            keywords = intermediate_keywords

            print("HERE 6")
            line_loc()

        elif (len(intermediate_keywords) == no_of_keywords + 2):

            intermediate_keywords.pop(-1)
            intermediate_keywords.pop(-1)
            keywords = intermediate_keywords
            print(keywords)
            print("HERE 7")
            line_loc()
        else:

            # lower case and sort keywords
            intermediate_keywords = keyword_manager.keyword_formatter(
                intermediate_keywords)

            # remove duplicate keywords
            intermediate_keywords = keyword_manager.keyword_reducer(
                intermediate_keywords)

            # remove irrelevant keywords
            intermediate_keywords = keyword_manager.remove_irrelevant_keywords(
                intermediate_keywords)

            if (len(intermediate_keywords) > 1
                    and len(intermediate_keywords) <= no_of_keywords):

                keywords = intermediate_keywords

                print("HERE 8")
                line_loc()

            elif (len(intermediate_keywords) == no_of_keywords + 1):

                intermediate_keywords.pop(-1)
                keywords = intermediate_keywords

                print("HERE 9")
                line_loc()

            elif (len(intermediate_keywords) == no_of_keywords + 2):

                intermediate_keywords.pop(-1)
                intermediate_keywords.pop(-1)
                keywords = intermediate_keywords

                print("HERE 10")
                line_loc()

            else:
                # use named entities in the title and description as keywords
                eir_keywords = list(
                    set(
                        keyword_manager.eir_keywords(user_link_title) +
                        keyword_manager.eir_keywords(user_link_description)))
                print(eir_keywords)
                # lower case
                eir_keywords = keyword_manager.keyword_formatter(eir_keywords)
                print(eir_keywords)
                # remove duplicates
                eir_keywords = keyword_manager.keyword_reducer(eir_keywords)
                print(eir_keywords)
                # remove irrelevant
                eir_keywords = keyword_manager.remove_irrelevant_keywords(
                    eir_keywords)
                print(eir_keywords)
                line_loc()

                if (len(eir_keywords) > 1
                        and len(eir_keywords) <= no_of_keywords):

                    keywords = eir_keywords

                    print("HERE 11")
                    line_loc()

                elif (len(eir_keywords) == no_of_keywords + 1):

                    eir_keywords.pop(-1)
                    keywords = eir_keywords

                    print("HERE 12")
                    line_loc()

                elif (len(eir_keywords) == no_of_keywords + 2):

                    eir_keywords.pop(-1)
                    eir_keywords.pop(-1)
                    keywords = eir_keywords

                    print("HERE 13")
                    line_loc()

                else:
                    # apply intersection between keywords in the description
                    # and keywords in the meta tag
                    eir_intersection_keywords = keyword_manager.eir_intersection_reduction(
                        user_link_description, user_link_meta_keywords)
                    print(eir_intersection_keywords)

                    # lower case and sort
                    eir_intersection_keywords = keyword_manager.keyword_formatter(
                        eir_intersection_keywords)
                    print(eir_intersection_keywords)

                    # remove duplicates
                    eir_intersection_keywords = keyword_manager.keyword_reducer(
                        eir_intersection_keywords)
                    print(eir_intersection_keywords)

                    # remove irrelevant
                    eir_intersection_keywords = keyword_manager.remove_irrelevant_keywords(
                        eir_intersection_keywords)
                    print(eir_intersection_keywords)
                    line_loc()

                    if (len(eir_intersection_keywords) <= 1):

                        ask_keywords_from_user = True

                        print("HERE 14")
                        line_loc()
                        highlight_back("Asking the user for keywords", 'R')

                    # BUGFIX: was `< no_of_keywords`, which dropped the
                    # exact-limit case into the broken branch below
                    elif (len(eir_intersection_keywords) > 1
                          and len(eir_intersection_keywords) <= no_of_keywords):

                        keywords = eir_intersection_keywords

                        print("HERE 15")
                        line_loc()

                    # BUGFIX: compared against `no_meta_eir_keywords` (a list
                    # that is undefined on this path -> NameError); the
                    # sibling branches show `no_of_keywords` was intended
                    elif (len(eir_intersection_keywords) ==
                          no_of_keywords + 1):

                        eir_intersection_keywords.pop(-1)
                        keywords = eir_intersection_keywords

                        print("HERE 16")
                        line_loc()

                    # BUGFIX: same undefined-name comparison as above
                    elif (len(eir_intersection_keywords) ==
                          no_of_keywords + 2):

                        eir_intersection_keywords.pop(-1)
                        eir_intersection_keywords.pop(-1)
                        keywords = eir_intersection_keywords

                        print("HERE 17")
                        line_loc()

                    else:

                        ask_keywords_from_user = True

                        print("HERE 18")
                        line_loc()
                        highlight_back("Asking the user for keywords", 'R')

        print(keywords)
        line_loc()

        # KEYWORD MANAGEMENT ENDS HERE-------------------

        if ask_keywords_from_user:

            # return failed status and ask the user to give the keywords
            return {
                "status":
                "fail",
                "suggestions":
                sorted(list(
                    set(user_link_meta_keywords +
                        keyword_manager.eir_keywords(user_link_description))),
                       key=len),
                "users_link":
                user_link_URL
            }

        else:

            api_news_handler = NewsApiHandle(API_Key=self.API_KEY,
                                             keyword_list=keywords)

            # initialize empty list for api content
            api_news_contents = []

            # get titles
            api_news_titles = api_news_handler.get_titles()

            # get sources
            all_news_sources = api_news_handler.get_sources()

            # get URLs
            api_news_Urls = api_news_handler.get_URLs()

            # get descriptions
            api_news_descriptons = api_news_handler.get_descriptions()

            # get only the content from the given urls
            api_news_crawler = ContentCrawler()

            # extract content from each URL and append
            for url in api_news_Urls:

                api_news_contents.append(api_news_crawler.extract_content(url))

            all_news_titles = api_news_titles

            all_news_descriptions = api_news_descriptons

            all_news_contents = api_news_contents

            # insert users news title on to the first element
            all_news_titles.insert(0, user_link_title)

            all_news_descriptions.insert(0, user_link_description)

            all_news_contents.insert(0, user_link_content)

            return ({
                "status":
                "success",
                "sources":
                all_news_sources,
                "titles":
                all_news_titles,
                "descriptions":
                all_news_descriptions,
                "contents":
                all_news_contents,
                "suggestions":
                sorted(list(
                    set(user_link_meta_keywords +
                        keyword_manager.eir_keywords(user_link_description))),
                       key=len),
                "top_img_URL":
                user_link_top_img_url,
                "users_link":
                user_link_URL,
                "api_news_urls":
                api_news_Urls
            })
Example #5
0
    def get_all_data(self, URL, no_of_keywords=6):
        """
        This method returns all the data in the form of a python dictionary which can be converted into json
        passing the parameter ['all'] returns all of the information
        passing the parameters like ['titles'], ['descriptions'], ['contents'] return that specific data

        This method should only be used while testing in cli mode: on
        keyword-extraction failure it blocks on input() to ask the user.

        :type URL: string
        :param URL: the url of the website

        :type no_of_keywords: int
        :param no_of_keywords: the maximum number of keywords considered
        """
        # initialize Crawler
        user_news_crawler = Crawler(URL)

        # get data from the user provided link
        user_news_content = user_news_crawler.get_content()

        user_news_title = user_news_crawler.get_title()

        user_news_meta_keywords = user_news_crawler.get_meta_keywords()

        user_news_meta_description = user_news_crawler.get_meta_description()

        #--------------------------------------------------
        # debug output only
        line_loc()
        print(user_news_meta_keywords)
        print(user_news_meta_description)
        print(user_news_title)
        line_loc()
        #---------------------------------------------------
        #-------------keyword processing START--------------

        # used to derive unique keywords to query the api with
        keywords_manager = KeyWordCheck()

        # initialize an empty list
        keywords = []

        # deduplicated meta keywords, sorted shortest-first so the longest
        # ones are popped when trimming to no_of_keywords
        intermediate_keywords = sorted((list(set(user_news_meta_keywords))),
                                       key=len)

        # for named entities as keywords
        eir_keywords = []

        # no usable meta keywords: fall back to named entities
        # NOTE(review): this tests the raw meta keyword list while the other
        # branches test the deduplicated intermediate_keywords — confirm
        # which is intended
        if (len(user_news_meta_keywords) <= 1):
            # named entities from the description and the title, deduplicated
            no_meta_eir_keywords = list(
                set(
                    keywords_manager.eir_keywords(user_news_meta_description) +
                    keywords_manager.eir_keywords(user_news_title)))
            # lower case and sort
            print(no_meta_eir_keywords)
            no_meta_eir_keywords = keywords_manager.keyword_formatter(
                no_meta_eir_keywords)
            print(no_meta_eir_keywords)
            # reduce similar keywords
            no_meta_eir_keywords = keywords_manager.keyword_reducer(
                no_meta_eir_keywords)
            print(no_meta_eir_keywords)
            # remove irrelevant keywords
            no_meta_eir_keywords = keywords_manager.remove_irrelevant_keywords(
                no_meta_eir_keywords)
            print("here 13")
            line_loc()
            print(no_meta_eir_keywords)
            # accept between 2 and no_of_keywords keywords
            if (len(no_meta_eir_keywords) > 1
                    and len(no_meta_eir_keywords) <= no_of_keywords):
                keywords = no_meta_eir_keywords
                print("here 14")
                line_loc()
            # one or two over the limit: pop the longest to fit
            elif (len(no_meta_eir_keywords) == (no_of_keywords + 1)):
                no_meta_eir_keywords.pop(-1)
                keywords = no_meta_eir_keywords
                print("here 15")
                line_loc()
            elif (len(no_meta_eir_keywords) == (no_of_keywords + 2)):
                no_meta_eir_keywords.pop(-1)
                no_meta_eir_keywords.pop(-1)
                keywords = no_meta_eir_keywords
                print("here 16")
                line_loc()
            else:
                print("here 17")
                highlight_back(
                    "There was a problem while extracting the keywords", 'R')
                highlight_fore(
                    "Please input unique keywords relevent to the article separated by ','",
                    "B")
                highlight_fore("Suggested Keywords: ", 'Y')
                highlight_fore(list(set(no_meta_eir_keywords)), 'G')
                # get keywords from the user (blocks on stdin; CLI only)
                keywords = input().split(',')
                line_loc()
        # between 2 and no_of_keywords meta keywords: use them directly
        elif (len(intermediate_keywords) >= 2
              and len(intermediate_keywords) <= no_of_keywords):
            # assign meta keywords to keywords
            keywords = intermediate_keywords
            print("here 1")
            line_loc()
        elif (len(intermediate_keywords) == (no_of_keywords + 1)):
            # pop the last (longest) element from the list
            intermediate_keywords.pop(-1)
            keywords = intermediate_keywords
            print("here 2")
            line_loc()
        elif (len(intermediate_keywords) == (no_of_keywords + 2)):
            intermediate_keywords.pop(-1)
            intermediate_keywords.pop(-1)
            keywords = intermediate_keywords
            print("here 3")
            line_loc()
        else:
            # too many meta keywords: format / reduce / filter them first
            # lower case the meta keywords and sort them according to their length
            intermediate_keywords = keywords_manager.keyword_formatter(
                intermediate_keywords)
            # reduce the number of keywords
            intermediate_keywords = keywords_manager.keyword_reducer(
                intermediate_keywords)
            # remove irrelevant keywords which are put for seo purposes
            intermediate_keywords = keywords_manager.remove_irrelevant_keywords(
                intermediate_keywords)
            # check if keywords now fit within the limit
            # NOTE(review): unlike the sibling branches this accepts lists of
            # length 0 or 1 as well — confirm intended
            if (len(intermediate_keywords) <= no_of_keywords):
                keywords = intermediate_keywords
                print("here 4")
                line_loc()
            elif (len(intermediate_keywords) == (no_of_keywords + 1)):
                intermediate_keywords.pop(-1)
                keywords = intermediate_keywords
                print("here 5")
                line_loc()
            elif (len(intermediate_keywords) == (no_of_keywords + 2)):
                intermediate_keywords.pop(-1)
                intermediate_keywords.pop(-1)
                keywords = intermediate_keywords
                print("here 6")
                line_loc()
            else:
                # import named entities from the description and the title as keywords
                eir_keywords = list(
                    set(
                        keywords_manager.eir_keywords(
                            user_news_meta_description) +
                        keywords_manager.eir_keywords(user_news_title)))
                # lower case and sort
                print(eir_keywords)
                eir_keywords = keywords_manager.keyword_formatter(eir_keywords)
                print(eir_keywords)
                # reduce similar keywords
                eir_keywords = keywords_manager.keyword_reducer(eir_keywords)
                print(eir_keywords)
                # remove irrelevant
                eir_keywords = keywords_manager.remove_irrelevant_keywords(
                    eir_keywords)
                print("here 7")
                line_loc()
                print(eir_keywords)
                # accept between 2 and no_of_keywords entity keywords
                if (len(eir_keywords) > 1
                        and len(eir_keywords) <= no_of_keywords):
                    keywords = eir_keywords
                    print("here 8")
                    line_loc()
                elif (len(eir_keywords) == (no_of_keywords + 1)):
                    eir_keywords.pop(-1)
                    keywords = eir_keywords
                    print("here 9")
                    line_loc()
                elif (len(eir_keywords) == (no_of_keywords + 2)):
                    eir_keywords.pop(-1)
                    eir_keywords.pop(-1)
                    keywords = eir_keywords
                    print("here 10")
                    line_loc()
                else:
                    # last resort: intersection between named entities and
                    # meta keywords (returns a set)
                    keywords = keywords_manager.eir_intersection_reduction(
                        user_news_meta_description, user_news_meta_keywords)
                    print("here 11")
                    line_loc()
                    if (len(keywords) <= 1):
                        print("here 12")
                        line_loc()
                        highlight_back(
                            "There was a problem while extracting the keywords",
                            'R')
                        highlight_fore(
                            "Please input unique keywords relevent to the article separated by ','",
                            "B")
                        highlight_fore("Suggested Keywords: ", 'Y')
                        highlight_fore(
                            list(set(intermediate_keywords + eir_keywords)),
                            'G')
                        # get keywords from the user (blocks on stdin)
                        keywords = input().split(',')
                    elif (len(keywords) > no_of_keywords):
                        print("here 12")
                        line_loc()
                        highlight_back(
                            "There was a problem while extracting the keywords",
                            'R')
                        highlight_fore(
                            "Please input unique keywords relevent to the article separated by ','",
                            "B")
                        highlight_fore("Suggested Keywords: ", 'Y')
                        highlight_fore(
                            list(set(intermediate_keywords + eir_keywords)),
                            'G')
                        # get keywords from the user (blocks on stdin)
                        keywords = input().split(',')

        #-------------keyword processing END--------------
        print(keywords)

        #--------------------News api related stuff START--------------------

        #initialize empty list for the content
        api_news_contents = []

        # initialize the client with an api key
        api_news_handler = NewsApiHandle(API_Key=self.API_KEY,
                                         keyword_list=keywords)

        api_news_titles = api_news_handler.get_titles()

        api_news_Urls = api_news_handler.get_URLs()

        api_news_descriptons = api_news_handler.get_descriptions()

        all_news_sources = api_news_handler.get_sources()

        # get only the content from the given Urls
        api_news_crawler = ContentCrawler()

        # crawl the content of every article the API returned
        for url in api_news_Urls:

            api_news_contents.append(api_news_crawler.extract_content(url))

        all_news_titles = api_news_titles

        all_news_descriptions = api_news_descriptons

        all_news_content = api_news_contents

        #--------------------News api related stuff END--------------------

        # insert the information from the user's link at index 0 so the
        # user's article always comes first
        all_news_titles.insert(0, user_news_title)

        all_news_descriptions.insert(0, user_news_meta_description)

        all_news_content.insert(0, user_news_content)

        # return all data as dictionary; later it can be converted into a
        # json object
        # NOTE(review): the key "descriptons" is misspelled, but callers may
        # already rely on it, so it is kept as-is
        return {
            "all": [
                all_news_sources, all_news_titles, all_news_descriptions,
                all_news_content
            ],
            "sources":
            all_news_sources,
            "titles":
            all_news_titles,
            "descriptons":
            all_news_descriptions,
            "contents":
            all_news_content
        }
Example #6
0
from fake_news.preprocessor.error_handle import line_loc, highlight_back
from abstractor2 import with_keywords, without_keywords
from flask import Flask, jsonify, request

# NewsAPI key; must be filled in before the server is useful
API_KEY = ""

# warn loudly at import time when the key has not been set
# NOTE(review): the message hardcodes "LINE 6" — verify that still matches
# this file's layout if lines are added above
if (len(API_KEY) <= 1):
    highlight_back("ENTER YOUR API KEY ABOVE ON LINE 6 OF THIS FILE", "R")
    line_loc()

app = Flask(__name__)


@app.route('/api/without', methods=['GET', 'POST'])
def without_k():
    """Aggregate related news for a user's link without user-given keywords.

    Expects a JSON POST body with a 'users_link' key and returns the
    jsonified result of without_keywords().
    """
    if (request.method == 'POST'):

        url = request.get_json()['users_link']

        return jsonify(without_keywords(url, API_KEY))

    # BUGFIX: the route also accepts GET, but the original handler returned
    # None for it, which makes Flask raise a TypeError. Return an explicit
    # error payload instead.
    return jsonify({
        "status": "fail",
        "reason": "send a POST request with a JSON body containing 'users_link'"
    })


@app.route('/api/with', methods=['GET', 'POST'])
def with_k():

    if (request.method == 'POST'):

        url_and_keywords = request.get_json()

        url = url_and_keywords['users_link']