Example no. 1
def get_important(splitted: List[str]):
    print("split", splitted)
    freq_r = rake_nltk.Rake(max_length=3, ranking_metric=Metric.WORD_FREQUENCY)
    freq_r.extract_keywords_from_text(" and ".join(splitted))
    freq_phrases = freq_r.get_ranked_phrases()

    deg_r = rake_nltk.Rake(max_length=3, ranking_metric=Metric.WORD_DEGREE)
    deg_r.extract_keywords_from_text(" ".join(splitted))
    deg_phrases = deg_r.get_ranked_phrases()

    freq_data = {i: find_freq(freq_phrases, i.lower()) for i in splitted}
    deg_data = {i: find_deg(deg_phrases, i.lower()) for i in splitted}
    data = {}
    for k, freq_v in freq_data.items():
        deg_v = deg_data[k]
        data[k] = freq_v + deg_v
    part_of_speech = nltk.pos_tag(splitted)
    print(part_of_speech)
    for i, part in part_of_speech:
        if part not in good:
            data[i] *= 0.5
        if i.lower() in [
                'thing', 'things', 'yeah', 'yes', 'something', 'actually',
                'really', 'life', 'lives', 'lot', 'ok', 'oh', 'well', 'stuff',
                'talking', 'look', 'looking', 'isn\'t', 'that\'s',
                'right'
        ]:
            data[i] *= 0.1
    """
FW foreign word
JJ adjective 'big'
JJR adjective, comparative 'bigger'
JJS adjective, superlative 'biggest'
NN noun, singular 'desk'
NNS noun plural 'desks'
NNP proper noun, singular 'Harrison'
NNPS proper noun, plural 'Americans'
RB adverb very, silently,
RBR adverb, comparative better
RBS adverb, superlative best
RP particle give up
VB verb, base form take
VBD verb, past tense took
VBG verb, gerund/present participle taking
VBN verb, past participle taken
VBZ verb, 3rd person sing. present takes
WP wh-pronoun who, what
WRB wh-adverb where, when
"""
    # print("deg_phrases",deg_phrases)
    # print("deg_data",deg_data)
    # print("freq_phrases",freq_phrases)
    # print("freq_data",freq_data)
    return [j[0] for j in sorted(data.items(), key=lambda i: -i[1])[:3]]
Example no. 2
def register_influencer():

    # Extract keywords and add compat

    text = request.args['text'].lower()
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(text)
    influencer_tags_set = set(r.get_ranked_phrases())
    # top-ranked (score, phrase) pair returned by rake_nltk
    most_used_word_tag = r.get_ranked_phrases_with_scores()[0]

    print(influencer_tags_set)

    db.collection('influencers').document(request.args['influencer_id']).set({
        'tags': list(influencer_tags_set),
        'most_used_word': str(most_used_word_tag)
    }, merge=True)

    for brand_doc_ref in db.collection('brands').stream():
        brand_tags_set = set(brand_doc_ref.to_dict()['tags'])
        common_tags = brand_tags_set.intersection(influencer_tags_set)

        db \
            .collection('compat') \
            .document(brand_doc_ref.id + "&" + request.args['influencer_id']) \
            .set({
                'common_tags': list(common_tags),
                # score = share of the brand's tags covered by the influencer's tags,
                # e.g. 2 shared tags out of 4 brand tags -> 50
                'score': 100 if common_tags == brand_tags_set
                         else len(common_tags) * 100 / len(brand_tags_set)
            })

    return jsonify({'tags_extracted': str(influencer_tags_set), 'user': '******', 'most_used_word': most_used_word_tag})
Example no. 3
def getMetadataSoup(metadata):
    m = metadata
    m['genre'] = metadata['genre'].map(lambda x: x.lower().split(' '))
    m['type_name'] = metadata['type_name'].map(lambda x: x.lower().split(' '))
    m['title'] = metadata['title'].map(lambda x: x.lower().split(' '))
    m['author'] = metadata['author'].map(lambda x: x.lower().split(' '))

    m['description_keywords'] = ""

    for i in m.index:
        description = m.at[i, 'description']
        r = rake_nltk.Rake()
        r.extract_keywords_from_text(description)
        keywords_dict_score = r.get_word_degrees()
        m.at[i, 'description_keywords'] = list(keywords_dict_score.keys())

    m.drop(columns=['description'], inplace=True)
    m.set_index('item_id', inplace=True)
    m['soup'] = ''
    columns = m.columns
    for index, rows in m.iterrows():
        words = ''
        for col in columns:
            words = words + ' '.join(rows[col]) + ' '
        # write back with .at: assigning to the iterrows() row copy would not update the frame
        m.at[index, 'soup'] = words

    m.drop(columns=[col for col in m.columns if col != 'soup'], inplace=True)

    return m
Example no. 4
def get_keywords(text, cutoff_score=None, limit=None):
    rake = rake_nltk.Rake()
    rake.extract_keywords_from_text(text)
    score_words = rake.get_ranked_phrases_with_scores()
    if limit is not None:
        score_words = score_words[:limit]
    if cutoff_score is not None:
        score_words = filter(lambda score_word: score_word[0] >= cutoff_score,
                             score_words)
    return [score_word[1] for score_word in score_words]
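A minimal usage sketch for get_keywords; the sample sentence, cutoff and limit are illustrative only, and rake_nltk plus the NLTK stopwords/punkt data are assumed to be installed:

sample = ("Compatibility of systems of linear constraints "
          "over the set of natural numbers.")
# keep at most 5 phrases, and only those whose RAKE score is at least 4.0
print(get_keywords(sample, cutoff_score=4.0, limit=5))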
Example no. 5
def get_rake_keywords(text, max_length=100000, metric=None):
    rake = rake_nltk.Rake(
        language='german',
        max_length=max_length,
        stopwords=stopwords_list,
        # use the metric argument if one was passed, otherwise fall back to degree/frequency
        ranking_metric=metric or rake_nltk.Metric.DEGREE_TO_FREQUENCY_RATIO)
    rake.extract_keywords_from_text(text)
    #if not keywords_number:
    #    keywords_number = len(sentences)//3
    return rake.get_ranked_phrases()
Example no. 6
def keyword_extraction(text):
    stoppath = 'data/stoplists/SmartStoplist.txt'
    # the three numbers are presumably minimum character length (5), maximum words
    # per phrase (3) and minimum keyword frequency (4), RAKE-tutorial style
    rake_object = rake.Rake(stoppath, 5, 3, 4)
    keywords = rake_object.run(text)
    final_list = list()
    for key in keywords:
        if (key[1] >= 4.5):
            final_list.append(key[0])
            print(key[0])
        else:
            break
    return final_list
Example no. 7
    def __init__(self, algorithm: str, **kwargs) -> None:
        """
        Keyword Extractor constructor.

        Parameters
        ----------
        algorithm : str
            The algorithm used to extract keywords. Supported algorithms: `rake` and `yake`.
        **kwargs: keyword arguments passed to the keyword extraction algorithm.

        Raises
        ------
        ImportError
            If the keyword extraction algorithm is not installed.
        ValueError
            If the specified extraction algorithm is not supported.
        """
        if algorithm == "rake":
            try:
                import rake_nltk
            except ImportError:
                print(
                    "Problem occured while trying to import rake-nltk. "
                    "If the library is not installed visit "
                    "https://csurfer.github.io/rake-nltk/_build/html/index.html "
                    "for more details."
                )
                raise

            extractor = rake_nltk.Rake(**kwargs)
        elif algorithm == "yake":
            try:
                import yake
            except ImportError:
                print(
                    "Problem occured while trying to import yake. "
                    "If the library is not installed visit "
                    "https://github.com/LIAAD/yake for more details."
                )
                raise

            extractor = yake.KeywordExtractor(**kwargs)

        else:
            raise ValueError(
                f"{algorithm} is not supported as keyword extraction algorithm. "
                f"Available algorithms: {['rake', 'yake']}"
            )

        self._algorithm = algorithm
        self._kw_extractor = extractor
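A short usage sketch for the constructor above. The enclosing class name is not shown in this excerpt, so KeywordExtractor is assumed; the keyword arguments are forwarded unchanged to the chosen backend:

extractor = KeywordExtractor("rake", max_length=3)    # kwargs go to rake_nltk.Rake
# extractor = KeywordExtractor("yake", n=2, top=10)   # kwargs go to yake.KeywordExtractor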
Example no. 8
 def __init__(self):
     self.rakeObj = rake_nltk.Rake()
     self.rakeDict = {}
     self.ranking = None
     self.wordList = None
     self.punct = set(string.punctuation)
     #self.snowStemmer = nltk.stem.snowball.SnowballStemmer("english")
     self.stopwords = set(nltk.corpus.stopwords.words('english'))
     self.chunkParser = nltk.RegexpParser(
         r"""
             NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
             PP: {<IN><NP>}               # Chunk prepositions followed by NP
             VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
             CLAUSE: {<NP><VP>}           # Chunk NP, VP
         """)
Example no. 9
def register_brand():
    text = request.args['text'].lower()
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(text)
    brands_tags_set = set(r.get_ranked_phrases())
    most_used_word_tag = r.get_ranked_phrases_with_scores()[0]

    print("Tags extracted from the description", brands_tags_set)

    db.collection('brands') \
        .document(request.args['brand_id']) \
        .set({
            'tags': list(brands_tags_set),
            'most_used_word': str(most_used_word_tag)
        }, merge=True)

    return jsonify({'tags_extracted': str(brands_tags_set), 'user': '******', 'most_used_word': most_used_word_tag})
Example no. 10
 def score(self, text: str, args: list) -> float:
     source_text = args[0]
     #checks if this is a rephrasing example
     process_parity = args[1]
     if not process_parity:
         return 0.0
     r = rake_nltk.Rake(max_length=3)
     r.extract_keywords_from_text(source_text)
      # keyword budget: the top 5% of the source text's tokens (rounded)
      keyword_max = round(0.05 * len(nltk.word_tokenize(source_text)))
      if keyword_max == 0:  # guard against a zero division on very short texts
          return 0.0
      phrases_original = r.get_ranked_phrases()[:keyword_max]
     r.extract_keywords_from_text(text)
      phrases_text = r.get_ranked_phrases()[:round(0.10 * len(nltk.word_tokenize(text)))]
     #count how many of the ranked phrases appear as keywords in the user's text
     phrase_count = 0
     for phrase in phrases_original:
         if phrase in phrases_text:
             phrase_count += 1
     return phrase_count / keyword_max
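As a rough worked example of the score above: a 200-token source text gives keyword_max = round(0.05 * 200) = 10, and if 4 of those 10 top phrases also appear among the user text's top phrases, the method returns 4 / 10 = 0.4.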
Example no. 11
def keywords_rake(text_file, num_keywords):

    # Read text file.
    with open(text_file, 'r') as f:
        text = f.read()

    rake = rake_nltk.Rake()

    # Extraction given the text.
    rake.extract_keywords_from_text(text)

    def no_numbers(phrase):
        for char in phrase:
            if char.isdigit():
                return False
        return True

    # Top phrases (numeric ones filtered out), with each RAKE score mapped through exp(-score).
    return [
        (keyword, np.exp(-score))
        for (score,
             keyword) in rake.get_ranked_phrases_with_scores()[:num_keywords]
        if no_numbers(keyword)
    ]
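A quick usage sketch for keywords_rake; the file path is a placeholder, and numpy plus rake_nltk are assumed to be importable:

# 'article.txt' is a hypothetical input file
for phrase, weight in keywords_rake('article.txt', num_keywords=10):
    print(round(weight, 3), phrase)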
Example no. 12
 OM_Func = lambda line, lib: line[lib['app_installs']] * (6.0 - line[lib[
     'app_score']]) * (line[lib['app_price']] + 0.43)
 Like_Func = lambda line, lib: line[lib['app_score']]
 # -------------------------------------------------------------------------------------------------------------------
 #                                       END OF Data Predicting Selection Stage
 # -------------------------------------------------------------------------------------------------------------------
 # Drop unwanted columns and compute the scores
 OM_Input, OM_Targets = formatForMeasure(
     globalDat, globalAttributes,
     ['app_category_primary', 'app_category_secondary'], Like_Func)
 # expand categories so they are mutually independent
 formatted = catagoryExpand(OM_Input)
 #Get text features
 text = [line[attrLib['app_description']] for line in globalDat]
 # RAKE picks out the words and phrases that look most important in each description
 raker = rake_nltk.Rake(max_length=2)
 for num in range(len(text)):
     raker.extract_keywords_from_text(text[num])
     text[num] = raker.get_ranked_phrases_with_scores()
 #text = [raker.extract_keywords_from_text(desc).get_ranked_phrases_with_scores() for desc in text] #(rank, word)
 snowy = SnowballStemmer("english")  # stemming maps e.g. training -> train, which simplifies matching
 text = [[(tuppy[0], snowy.stem(tuppy[1].lower())) for tuppy in desc]
         for desc in text]  #stem all the words
 wordBucket = {}
 #store the 'worth' of the features, which I am determining as a function
 #of relevance and frequency.
 for desc in text:
     for tuppy in desc:
         if wordBucket.get(tuppy[1]) is None:
             wordBucket[tuppy[1]] = tuppy[0]
Example no. 13
    def __init__(self,
                 classifier_dims,
                 num_classes,
                 embedding_dims,
                 gaussian_noise,
                 dropout,
                 internal_dims,
                 n_layers,
                 featurizer,
                 final_layer_builder,
                 n_tokens_in=64,
                 n_tokens_out=16,
                 capabilities2dims=dict(),
                 use_as_super=False,
                 **kwargs):
        super(LangFeaturesModel, self).__init__(classifier_dims,
                                                num_classes,
                                                embedding_dims,
                                                gaussian_noise,
                                                dropout,
                                                internal_dims,
                                                n_layers,
                                                featurizer,
                                                final_layer_builder,
                                                n_tokens_in,
                                                n_tokens_out,
                                                use_as_super=True,
                                                **kwargs)
        assert "capabilities" in kwargs
        capabilities = kwargs["capabilities"]
        kwargs["rake_dims"] = kwargs.get("rake_dims", 32)
        kwargs["yake_dims"] = kwargs.get("yake_dims", 32)
        assert "key_phrases" not in capabilities or "spacy" in capabilities
        use_layer_norm = kwargs.get("use_layer_norm", False)
        self.capabilities = capabilities
        embedding_dim = 8
        cap_to_dim_map = {
            "spacy": 128,
            "snlp": 32,
            "key_phrases": 64,
            "nltk": 192,
            "full_view": 64,
            "tmoji": 32,
            "ibm_max": 16,
            "gensim": 256,
            "fasttext_crawl": 256
        }
        cap_to_dim_map.update(capabilities2dims)
        all_dims = sum([cap_to_dim_map[c] for c in capabilities])
        self.cap_to_dim_map = cap_to_dim_map
        self.all_dims = all_dims

        if "spacy" in capabilities:
            tr = pytextrank.TextRank(token_lookback=7)
            self.nlp = spacy.load("en_core_web_lg", disable=[])
            self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
            spacy_in_dims = (96 * 2) + (11 * embedding_dim) + 2
            self.spacy_nn = ExpandContract(spacy_in_dims,
                                           cap_to_dim_map["spacy"],
                                           dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(2, 4))

        if "fasttext_crawl" in capabilities:
            self.bpe = BPEmb(dim=200)
            self.cngram = CharNGram()
            fasttext_crawl_file = kwargs.get("fasttext_crawl_file",
                                             "crawl-300d-2M-subword.bin")
            self.crawl = fasttext.load_model(fasttext_crawl_file)
            self.crawl_nn = ExpandContract(200 + 300 + 100,
                                           cap_to_dim_map["fasttext_crawl"],
                                           dropout,
                                           use_layer_norm=use_layer_norm,
                                           groups=(4, 4))

        if "gensim" in capabilities:
            gensim = [
                api.load("glove-twitter-50"),
                api.load("glove-wiki-gigaword-50"),
                api.load("word2vec-google-news-300"),
                api.load("conceptnet-numberbatch-17-06-300")
            ]
            self.gensim = gensim
            self.gensim_nn = ExpandContract(400,
                                            cap_to_dim_map["gensim"],
                                            dropout,
                                            use_layer_norm=use_layer_norm,
                                            groups=(4, 4))

        if "full_view" in capabilities:
            full_sent_in_dims = 300
            self.full_sent_nn = ExpandContract(full_sent_in_dims,
                                               cap_to_dim_map["full_view"],
                                               dropout,
                                               use_layer_norm=use_layer_norm,
                                               groups=(4, 4))

        if "snlp" in capabilities:
            import stanza
            self.snlp = stanza.Pipeline(
                'en',
                processors='tokenize,pos,lemma,depparse,ner',
                use_gpu=False,
                pos_batch_size=2048)
            self.snlp_nn = ExpandContract(embedding_dim * 5,
                                          cap_to_dim_map["snlp"],
                                          dropout,
                                          use_layer_norm=use_layer_norm)
        if "key_phrases" in capabilities:
            import yake
            self.kw_extractor = yake.KeywordExtractor(lan="en",
                                                      n=3,
                                                      dedupLim=0.9,
                                                      dedupFunc='seqm',
                                                      windowsSize=3,
                                                      top=10,
                                                      features=None)

            self.key_occ_cnt_pytextrank = nn.Embedding(8, embedding_dim)
            nn.init.normal_(self.key_occ_cnt_pytextrank.weight,
                            std=1 / embedding_dim)
            self.key_wc_pytextrank = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_pytextrank.weight,
                            std=1 / embedding_dim)

            yake_dims = kwargs["yake_dims"] if "yake_dims" in kwargs else 32
            self.yake_dims = yake_dims
            self.yake_nn = ExpandContract(300,
                                          yake_dims,
                                          dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 2))

            try:
                from multi_rake import Rake
                rake_dims = kwargs["rake_dims"] if "rake_dims" in kwargs else 32
                self.rake_dims = rake_dims
                self.rake_nn = ExpandContract(300,
                                              rake_dims,
                                              dropout,
                                              use_layer_norm=use_layer_norm,
                                              groups=(2, 2))
                self.rake = Rake(language_code="en")
                keyphrases_dim = 2 * embedding_dim + rake_dims + yake_dims
            except ImportError:
                self.rake = None
                keyphrases_dim = 2 * embedding_dim + yake_dims
            self.keyphrase_nn = ExpandContract(keyphrases_dim,
                                               cap_to_dim_map["key_phrases"],
                                               dropout,
                                               use_layer_norm=use_layer_norm,
                                               groups=(4, 4))

        fasttext_file = kwargs.get("fasttext_file",
                                   "wiki-news-300d-1M-subword.bin")
        if not set(capabilities).isdisjoint(
            {"key_phrases", "full_view", "nltk"}):
            self.text_model = fasttext.load_model(fasttext_file)

        self.pdict = get_all_tags()
        self.tag_em = nn.Embedding(len(self.pdict) + 1, embedding_dim)
        nn.init.normal_(self.tag_em.weight, std=1 / embedding_dim)

        self.sw_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sw_em.weight, std=1 / embedding_dim)

        self.sent_start_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.sent_start_em.weight, std=1 / embedding_dim)

        self.is_oov_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_oov_em.weight, std=1 / embedding_dim)

        self.has_digit_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.has_digit_em.weight, std=1 / embedding_dim)

        self.is_mask_em = nn.Embedding(2, embedding_dim)
        nn.init.normal_(self.is_mask_em.weight, std=1 / embedding_dim)

        self.w_len = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.w_len.weight, std=1 / embedding_dim)

        self.wc_emb = nn.Embedding(16, embedding_dim)
        nn.init.normal_(self.wc_emb.weight, std=1 / embedding_dim)

        if "nltk" in capabilities:
            import rake_nltk
            from textblob import TextBlob
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentimentIntensityAnalyzer
            self.stop_words = set(stopwords.words('english'))
            self.rake_nltk = rake_nltk.Rake()
            self.key_wc_rake_nltk = nn.Embedding(4, embedding_dim)
            nn.init.normal_(self.key_wc_rake_nltk.weight,
                            std=1 / embedding_dim)
            self.nltk_sid = SentimentIntensityAnalyzer()
            self.vader_sid = VaderSentimentIntensityAnalyzer()
            in_dims = 310 + 5 * embedding_dim
            self.nltk_nn = ExpandContract(in_dims,
                                          cap_to_dim_map["nltk"],
                                          dropout,
                                          use_layer_norm=use_layer_norm,
                                          groups=(2, 4))

        if "ibm_max" in capabilities:
            from ..external import ModelWrapper
            self.ibm_max = ModelWrapper()
            for p in self.ibm_max.model.parameters():
                p.requires_grad = False
            self.ibm_nn = ExpandContract(6,
                                         cap_to_dim_map["ibm_max"],
                                         dropout,
                                         use_layer_norm=use_layer_norm,
                                         groups=(1, 1))

        if "tmoji" in capabilities:
            from torchmoji.sentence_tokenizer import SentenceTokenizer
            from torchmoji.model_def import torchmoji_emojis
            from torchmoji.global_variables import PRETRAINED_PATH, VOCAB_PATH
            with open(VOCAB_PATH, 'r') as f:
                maxlen = self.n_tokens_in
                self.vocabulary = json.load(f)
                self.st = SentenceTokenizer(self.vocabulary, maxlen)
                self.tmoji = torchmoji_emojis(PRETRAINED_PATH)
                for p in self.tmoji.parameters():
                    p.requires_grad = False
            self.tm_nn = ExpandContract(64,
                                        cap_to_dim_map["tmoji"],
                                        dropout,
                                        use_layer_norm=use_layer_norm,
                                        groups=(1, 1))

        self.contract_nn = ExpandContract(self.all_dims,
                                          embedding_dims,
                                          dropout,
                                          use_layer_norm=True,
                                          unit_norm=False,
                                          groups=(4, 4))
        if not use_as_super:
            if featurizer == "cnn":
                self.featurizer = CNN1DFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)
            elif featurizer == "gru":
                self.featurizer = GRUFeaturizer(n_tokens_in, embedding_dims,
                                                n_tokens_out, classifier_dims,
                                                internal_dims, n_layers,
                                                gaussian_noise, dropout)
            elif featurizer == "basic":
                self.featurizer = BasicFeaturizer(n_tokens_in, embedding_dims,
                                                  n_tokens_out,
                                                  classifier_dims,
                                                  internal_dims, n_layers,
                                                  gaussian_noise, dropout)

            elif featurizer == "transformer":
                self.attention_drop_proba = kwargs.get("attention_drop_proba", 0.0)
                n_encoders = kwargs.pop("n_encoders", n_layers)
                n_decoders = kwargs.pop("n_decoders", n_layers)
                self.featurizer = TransformerFeaturizer(
                    n_tokens_in, embedding_dims, n_tokens_out, classifier_dims,
                    internal_dims, n_encoders, n_decoders, gaussian_noise,
                    dropout, self.attention_drop_proba)
            else:
                raise NotImplementedError()

            self.final_layer = final_layer_builder(classifier_dims,
                                                   n_tokens_out, num_classes,
                                                   dropout, **kwargs)
        if "stored_model" in kwargs:
            load_stored_params(self, kwargs["stored_model"])
        self.reg_layers = get_regularization_layers(self)
Example no. 14
def get_imp_terms_rake(x):
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(x)
    return r.get_ranked_phrases()[0:4]
Example no. 15
    x = pd.Series(x, index=tfidf_bigrams.get_feature_names())
    x = x.sort_values(ascending=False)
    return x.head(4).index.tolist()


transcripts['imp_terms_bigrams'] = [
    get_imp_terms_bigram(x) for x in matrix_bigrams
]

transcripts['imp_terms_bigrams'] = transcripts['imp_terms_bigrams'].map(
    lambda x: ",".join(x))

## Keywords using RAKE, didn't work
import rake_nltk

r = rake_nltk.Rake()


def get_imp_terms_rake(x):
    r = rake_nltk.Rake()
    r.extract_keywords_from_text(x)
    return r.get_ranked_phrases()[0:4]


transcripts['imp_terms_rake'] = [get_imp_terms_rake(x) for x in Text]
transcripts['imp_terms_rake'] = transcripts['imp_terms_rake'].map(
    lambda x: ",".join(x))

transcripts.drop("imp_terms_rake", axis=1, inplace=True)

transcripts.to_csv("transcripts_key_words.csv", index=False, encoding="utf-8")