Example No. 1
def keybert_keyword_extractor_raw_text(filename, keyphrase_range=(1, 2)):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    word_set = set()
    word_array = list()
    with open(filename, "r", encoding="utf-8") as file:
        try:
            extractor_data = file.readlines()
            keywords = model.extract_keywords(
                extractor_data,
                keyphrase_ngram_range=keyphrase_range,
                stop_words='english')
            for record in keywords:
                for record_part in record:
                    if record_part == 'None Found':
                        # print("Not found")
                        continue
                    # record_part is a (keyword, score) tuple; keep the keyword
                    # and ignore the score at position 1
                    for position, part in enumerate(record_part):
                        if position == 0:
                            word_set.add(part)
                        elif position > 1:
                            print("Error: position is greater than 1: ")
                            print(part)
            word_array = list(word_set)

        except UnicodeDecodeError:
            print("Cant extract data from file: " + filename)
        except ValueError as e:
            print("Error: Value error: ")
            print(e)
    return word_array
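A minimal driver for the extractor above, shown as a sketch; the file name is a placeholder, not part of the original example.

# Hypothetical usage of keybert_keyword_extractor_raw_text; "notes.txt" is a placeholder path
if __name__ == "__main__":
    unique_keywords = keybert_keyword_extractor_raw_text("notes.txt", keyphrase_range=(1, 2))
    print(len(unique_keywords), "unique keywords extracted")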
Example No. 2
 def run(self) -> None:
     keywords_model = KeyBERT("xlm-r-distilroberta-base-paraphrase-v1")
     stop_words = stopwords.words("english")
     while True:
         urls = self.provider.get_records()
         if len(urls) == 0:
             break
         bulk = websites_db.initialize_unordered_bulk_op()
         for document in tqdm(urls, desc="thread", leave=False):
             page_text = document["page_text"].replace("\n", " ").strip()
             summary = document["xl_summary"]
             processed_text = " ".join(document["processed_text"])
             id = document["_id"]
             try:
                 summary_keywords, text_keywords, processed_keywords = keywords_model.extract_keywords(
                     [summary, page_text, processed_text],
                     keyphrase_ngram_range=(2, 2),
                     stop_words=stop_words)
             except Exception as ex:
                 print(ex)
                 continue
             bulk.find({
                 "_id": id
             }).update_one({
                 "$set": {
                     "summary_keywords": summary_keywords,
                     "text_keywords": text_keywords,
                     "processed_keywords": processed_keywords
                 }
             })
         bulk.execute()
Example No. 3
 def __init__(self, lang):
     if lang == "de":
         self.model = KeyBERT("dbmdz/bert-base-german-uncased")
     elif lang == "en":
         self.model = KeyBERT("paraphrase-MiniLM-L6-v2")
     else:
         raise NotImplementedError()
def extract_candidateterms_keybert_preprocessed(descriptions,
                                                max_ngram,
                                                faster_keybert=False,
                                                verbose=False,
                                                **kwargs):
    from keybert import KeyBERT  # lazily loaded as it needs tensorflow/torch which takes some time to init
    model_name = "paraphrase-MiniLM-L6-v2" if faster_keybert else "paraphrase-mpnet-base-v2"
    print(f"Using model {model_name}")
    candidateterms = []
    kw_model = KeyBERT(model_name)
    descs = descriptions._descriptions if not get_setting(
        "DEBUG") else descriptions._descriptions[:get_setting("DEBUG_N_ITEMS")]
    for desc in tqdm(descs, desc="Running KeyBERT on descriptions"):
        stopwords = get_stopwords(desc.lang)
        candidates = set()
        for nwords in range(1, max_ngram):
            n_candidates = kw_model.extract_keywords(
                desc.processed_as_string(),
                keyphrase_ngram_range=(1, nwords),
                stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)
        if (ct := extract_coursetype(desc)) and ct not in candidates:
            candidates += [ct]
        candidateterms.append(candidates)
Example No. 5
def results():
    # get data
    URLS = ['https://www.binance.com/en', 'http://www.supermap.com']
    ATTRIBUTES = ['description', 'keywords', 'Description', 'Keywords']
    collected_data = []
    res = []
    data = request.form['command']
    # ..............................................
    URLS = [data]
    for url in URLS:
        entry = {'url': url}
        try:
            r = requests.get(url)
        except Exception as e:
            res = 'Could not load page {}. Reason: {}'.format(url, str(e))
            print('Could not load page {}. Reason: {}'.format(url, str(e)))
            return render_template('results.html', predictions=res)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, 'html.parser')
            meta_list = soup.find_all("meta")
            for meta in meta_list:
                if 'name' in meta.attrs.keys() and meta.attrs['name'].strip(
                ).lower() in ['description', 'keywords']:
                    name = meta.attrs['name']
                    entry[name.lower()] = meta.attrs['content']
            # if len(entry) == 3:
            collected_data.append(entry)
            # else:
            #     print('Could not find all required attributes for URL {}'.format(url))
            #     res = 'Could not find all required attributes for URL {}'.format(url)
            #     return render_template('results.html',predictions=res)
        else:
            print('Could not load page {}. Reason: {}'.format(
                url, r.status_code))
            res = 'Could not load page {}. Reason: {}'.format(
                url, r.status_code)
            return render_template('results.html', predictions=res)
    print('Collected meta attributes (TODO - push to DB):')
    for entry in collected_data:
        print(entry)
        print("Summary ")

        # Textrank method
        print(keywords(str(entry)).split('\n'))
        print('\n')
        # KeyBERT method
        from keybert import KeyBERT
        model = KeyBERT('distilbert-base-nli-mean-tokens')
        print(
            model.extract_keywords(str(entry),
                                   keyphrase_ngram_range=(1, 2),
                                   stop_words=None))
        print('\n')
        res = model.extract_keywords(str(entry),
                                     keyphrase_ngram_range=(1, 2),
                                     stop_words=None)

    return render_template('results.html', predictions=res)
Example No. 6
def keybertify(data, range=1):
    # note: `range` shadows the built-in; kept to match the original signature
    range = int(range)
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    #model = KeyBERT('distilbert-base-nli-stsb-mean-tokens')
    #model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')
    keywords = model.extract_keywords(data, keyphrase_ngram_range=(1, range))
    return keywords
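A quick sanity check of the helper above; the sample sentence is illustrative only.

# Illustrative call; any short text works here
sample = "Supervised learning maps labeled training data to predictions."
print(keybertify(sample, range=2))  # list of (keyphrase, score) tuples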
Example No. 7
def getkeywords_key_bert(name):
    text = getemail(name)
    stopwords = stopwordslist("/home/chenqi/work/openwall/stopwords.txt")
    kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    keywords = kw_extractor.extract_keywords(text,
                                             keyphrase_ngram_range=(1, 1),
                                             stop_words=stopwords)
    print("Keywords of article", keywords)
Example No. 8
def make_keywords(dataset):
    kw_model = KeyBERT()
    df = pd.DataFrame(columns=['text', 'keywords'])
    df["text"] = dataset["text"]
    for i in tqdm(range(len(df))):
        keyword = kw_model.extract_keywords(df['text'][i])
        clean = clean_keywords(keyword)
        df["keywords"][i] = clean
    return df
Example No. 9
def getkeywords_key_bert(text):
    #text = getemail(name)
    stopwords = stopwordslist("stopwords.txt")
    kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    keywords = kw_extractor.extract_keywords(text,
                                             keyphrase_ngram_range=(1, 1),
                                             stop_words=stopwords,
                                             min_df=1,
                                             use_maxsum=True,
                                             use_mmr=True)
    print("Keywords of article", keywords)
    return keywords
 def __init__(self, is_multilan, faster=False, max_ngram=1):
     """available models: https://github.com/MaartenGr/KeyBERT#25-embedding-models"""
     from keybert import KeyBERT  #lazily loaded as it needs tensorflow which takes some time to init
     assert not (is_multilan and faster)
     if faster:
         self.model_name = "paraphrase-MiniLM-L6-v2"
     elif is_multilan:
         self.model_name = "paraphrase-multilingual-MiniLM-L12-v2"
     else:
         self.model_name = "paraphrase-mpnet-base-v2"
     print(f"Using model {self.model_name}")
     self.kw_model = KeyBERT(self.model_name)
     self.max_ngram = max_ngram
Example No. 11
class KeybertRepr():
    def __init__(self, lang):
        if lang == "de":
            self.model = KeyBERT("dbmdz/bert-base-german-uncased")
        elif lang == "en":
            self.model = KeyBERT("paraphrase-MiniLM-L6-v2")
        else:
            raise NotImplementedError()

    def get_repr(self, lst, n_shuffles=5, max_ngram=None):
        trials = []
        for n_comb in range(n_shuffles):
            lst = random.sample(lst, len(lst))
            cands = lst
            if max_ngram is not None:
                cands = [i for i in lst if i.count(" ") < max_ngram]
                if not cands: cands = lst
            trials.append(self.model.extract_keywords(". ".join(lst), candidates=cands, top_n=1)[0])
        if len(set(i[0] for i in trials)) == 1:
            return trials[0][0]
        elif (cnt := sorted(Counter([i[0] for i in trials]).values(), reverse=True))[0] > cnt[1]: #if one is detected more often than the others
            return Counter([i[0] for i in trials]).most_common(1)[0][0]
        #now: return the one that is most often extracted and then the highest score from that
        trials = [j for j in trials if Counter([i[0] for i in trials])[j[0]] == max(Counter([i[0] for i in trials]).values())]
        return max(trials, key=lambda x:x[1])[0]
Example No. 12
def load_models():
    """
    Function which loads the English NLP model and the KeyBERT model.
    This needs to run once, since all models take a few seconds to load.
    """
    return (spacy.load('en_core_web_sm'),
            KeyBERT('distiluse-base-multilingual-cased-v2'))
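A sketch of how the pair returned by load_models might be used once loaded; the example sentence is made up.

# Load once, then reuse both models
nlp, kw_model = load_models()
text = "KeyBERT extracts keyphrases using multilingual sentence embeddings."
doc = nlp(text)                               # spaCy pipeline
keyphrases = kw_model.extract_keywords(text)  # KeyBERT keyphrases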
class Keyword:
    def __init__(self):
        self.model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')
        self.textInput = ""
    def setInput(self, textInput):
        self.textInput = textInput
    def getKeyword(self):
        keyword = self.model.extract_keywords(self.textInput, keyphrase_ngram_range=(1, 2), stop_words=None)
        return keyword
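A possible way to drive the Keyword class above; the input string is illustrative only.

# Hypothetical usage of the Keyword wrapper
kw = Keyword()
kw.setInput("Transformer models compute contextual embeddings for every token.")
print(kw.getKeyword())  # list of (keyphrase, score) tuples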
Example No. 14
def keybert_keyword_extractor(filename, keyphrase_range=(1, 2)):
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    extractor_data = load_as_json(filename)
    categories = dict()
    result_dict = dict()
    for content in extractor_data:
        if content['category'] not in categories:
            categories[content['category']] = []
            result_dict[content['category']] = []
        categories[content['category']].append(content['text'])
    for category, category_array in categories.items():
        print(category)
        word_set = set()
        for text_from_category in category_array:
            keywords = model.extract_keywords(
                text_from_category,
                keyphrase_ngram_range=keyphrase_range,
                stop_words='english')
            for keyword, value in keywords:
                word_set.add(keyword)
        word_array = list(word_set)
        print(len(word_array))
        result_dict[category] = word_array
    return result_dict
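A usage sketch for the categorised extractor above. It assumes load_as_json yields records with 'category' and 'text' keys, as the loop implies; the file name is a placeholder.

# Hypothetical input file; each record must look like {"category": "...", "text": "..."}
keywords_by_category = keybert_keyword_extractor("articles.json", keyphrase_range=(1, 2))
for category, words in keywords_by_category.items():
    print(category, words[:5])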
Example No. 15
def extract_paper_keywords(input_csv, out_csv, keywordCount):
    # record_list = pd.read_csv(input_csv).to_dict(orient='records')
    list_dic = pd.read_csv(input_csv).to_dict(orient='list')
    model = KeyBERT('distilbert-base-nli-mean-tokens')
    # res = []
    res = {}
    # for index, record in enumerate(record_list):
    keyword_list_list = []
    for index, doc in enumerate(list_dic['allTitleAndAbstract']):
        print('-' * 100)
        print("index: " + str(index + 1) + "/" +
              str(len(list_dic['nameWithOwner'])) + ", repo: " +
              str(list_dic['nameWithOwner'][index]))
        # To diversify the results, we can use Maximal Marginal Relevance (MMR), which is also based on cosine similarity, to create keywords/keyphrases. Results with high diversity:
        tuple_list = model.extract_keywords(doc,
                                            keyphrase_ngram_range=(1, 1),
                                            stop_words='english',
                                            top_n=keywordCount,
                                            use_mmr=True,
                                            diversity=0.7)
        # take as many keywords as possible
        tmp = keywordCount
        while len(tuple_list) == 0:
            tmp = tmp - 5
            tuple_list = model.extract_keywords(doc,
                                                keyphrase_ngram_range=(1, 1),
                                                stop_words='english',
                                                top_n=tmp,
                                                use_mmr=True,
                                                diversity=0.7)
        keyword_list_list.append([candidate[0] for candidate in tuple_list])
    pre_keyword_list_list = lemmatisation(keyword_list_list)
    res['nameWithOwner'] = list_dic['nameWithOwner']
    res['content'] = [' '.join(x) for x in pre_keyword_list_list]
    pd.DataFrame.from_dict(res, orient='columns').to_csv(out_csv, index=False)
    pass
Example No. 17
class NLP_Wuwana():
    def __init__(
        self,
        db,
        languages,
        weight_field,
        spacy_model,
        remove_words="./data/words_to_remove.txt",
        replace_words="./data/words_to_replace.txt",
        tags_alwaysmain="./data/finaltags_alwaysmain.txt",
        tags_toremove="./data/finaltags_toremove.txt",
        empha_words=False,
        empha_multi=1,
        desc_field="description",
        max_words=5,
    ):
        """Class defined to process wuwana description tags. It attacks db and uses 3 NLP Libraries:
        - Spacy as tokenizer.
        - Wordcloud as tag modeller.
        - Gensim as tag modeller.
        

        Parameters
        -----------
        languages: list with languages in format: ["es","fr","zh-cn"].
        remove_words: path to file of words to be removed.
        replace_words: path to file of words to be replaced.
        tags_alwaysmain: tags that will always be the main tag (just the first occurrence, in order of appearance).
        tags_toremove: tags that will never appear.
        spacy_model: pretrained Spacy model.
        max_words: max words to be extracted from description texts.
        desc_field: field where text is stored in company table.
        weight_field: Field in company table where weights will be stored.
       
        """

        #file with words to be removed from tags
        self.file_words = open(remove_words, "r", encoding="utf-8")
        self.remove_words = self.file_words.read().split(";")

        #file with words to emphasize
        self.empha_multi = empha_multi
        self.empha_words = empha_words

        if (empha_words):
            self.file_empha = open(empha_words, "r", encoding="utf-8")
            self.words_to_emphasize = self.file_empha.read().split(";")

        #tags to remove
        self.file_words = open(tags_toremove, "r", encoding="utf-8")
        self.tags_toremove = self.file_words.read().split(";")

        #tags always as main
        self.file_words = open(tags_alwaysmain, "r", encoding="utf-8")
        self.tags_alwaysmain = self.file_words.read().split(";")

        #bag of words that should be replaced, such as abbreviations
        with open(replace_words, "r", encoding="utf-8") as f_in:
            self.replace_words = json.load(f_in)

        self.translator = google_translator()
        self.db = db
        self.cursor_tag = self.db.cursor()
        self.max_words = max_words
        self.desc_field = desc_field
        self.languages = languages
        self.weight_field = weight_field

        # English pretrained Spacy model
        try:
            self.nlp = spacy.load("en_core_web_lg")
        except:
            sys.exit(
                "ERROR: You must download en_core_web_lg spacy model. Use 'python -m spacy download en_core_web_lg' "
            )

    ########
    #MAIN##
    ########

    def process_query_companies(self, lib, onlyid=False, column_pos=1):
        """ Function that launch a sql query and extract main tags from column
        
        Parameters
        -----------
        lib: NLP lib to use (gensim, wordcloud or keybert)
        onlyid: if set, changes are applied only to that company ID
        column_pos: position of the first column to extract text
        
        """

        if (onlyid):
            query = "select company.ID, company." + str(
                self.desc_field) + " from company where ID = '" + str(
                    onlyid) + "'"
        else:
            query = "select company.ID, company." + str(
                self.desc_field) + " from company "

        if ((lib != "gensim") & (lib != "wordcloud") & (lib != "keybert")):
            sys.exit("ERROR: Unknown library: " + str(lib))
        else:
            pass

        if (lib == "keybert"):
            self.model = KeyBERT('distilbert-base-nli-mean-tokens')

        updates_list = []

        try:
            # Execute the SQL command
            cursor = self.db.cursor()
            result = cursor.execute(query)
            self.db.commit()
        except Exception as e:
            print("ERROR LOADING DB ", str(e))
            pass

        n = 0
        if (result):
            rows = cursor.fetchall()
            print("Processing " + str(result) + " companies.")

            for row in rows:
                try:
                    text = row[column_pos]
                    nouns_ex = self.process_text(text, lib)
                    tags_english = self.get_keywords(nouns_ex,
                                                     self.max_words,
                                                     lib=lib)

                    if (tags_english):

                        tags_main = dict()
                        tags_all = dict()

                        tags_english_split = tags_english[1].split(";")

                        #remove predefined tags. insert main tags. (from file)

                        tags_english_split = self.remove_finaltags(
                            tags_english_split)
                        tags_english_split = self.put_maintags(
                            tags_english_split)

                        for l in self.languages:

                            tags_main[l] = self.get_first_text(
                                self.get_translation(tags_english[0], lang=l))
                            tags_all[l] = self.get_first_text(
                                self.get_translation(tags_english[1], lang=l))

                            tag_list = []

                        main_tag = False
                        second_tag = False
                        other_tags = False

                        for x in range(0, len(tags_english_split)):

                            if (len(tags_english_split[x]) > 0):
                                for s in tags_all.keys():
                                    try:
                                        tag_split = tags_all[s].split(
                                            ";")[x].strip()
                                    except:
                                        tag_split = "-"

                                    tag_list.append(tag_split)

                                self.check_and_insert_tag(
                                    tags_english_split[x], tag_list)
                                tag_list = []

                            if (x == 0):
                                main_tag = tags_english_split[x].strip()
                            elif (x == 1):
                                second_tag = tags_english_split[x].strip()
                            elif (x == 2):
                                other_tags = tags_english_split[x].strip()
                            else:
                                other_tags += ";" + tags_english_split[
                                    x].strip()

                        updates_list.append(
                            self.update_company_tags(main_tag, row[0],
                                                     tags_english[2],
                                                     second_tag, other_tags))

                    else:
                        print("WARNING: No tags extracted for ID", row[0],
                              "with text:", text)

                except Exception as e:
                    print("ERROR Processing query row: ", str(e))

        else:
            print("WARNING: NO rows for that ID.")

        #update tags - execute queries
        for i in updates_list:
            try:
                cursor.execute(i)
                self.db.commit()
            except Exception as e:
                print("Error ", str(e))

    def process_text(self, text, lib):
        """ Function that processes a text with a pipeline of tasks, and returns transformed and cleaned text 
        to be used by NLP libs
        
        Parameters
        -----------
        text:  text to extract tags
        lib: NLP library to be used afterwards (wordcloud, gensim or keybert)
        return: cleaned and transformed text
        """

        #remove hashtags, mentions, and links. Comment this line to let hashtags and mentions appear.
        text = self.strip_all_entities(self.strip_links(text))
        #remove special chars.
        text = self.remove_special_characters(text)
        #remove emojis.
        text = self.remove_emojis(text)
        #detect source lang and translate to english if necessary.
        source_lang = self.detect_lang(text)

        #print("ORIG TEXT:", text)

        if source_lang:
            if source_lang != 'en':
                text = self.get_translation(text)

        else:
            print(
                "WARNING: No specific language detected. Translating sentences (slow)"
            )
            text = self.translate_sentence_by_sentence(text)

        #to lowercase.
        text = text.lower()

        #emphasize words if required. It repeats certain words in text (from file).
        if (self.empha_words):
            text = self.emphasize_words(text)

        # Spacy model and custom tokenizer
        self.nlp.tokenizer = self.custom_tokenizer()
        sentence = ''
        # Extract sentences
        text_lines = text.split(".")

        if (lib == "wordcloud"):

            #get nouns longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ['NOUN']) & (len(word.text) > 1)):
                    sentence += word.text + ' '
            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words and lemmatize
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            #last nouns filter
            fin_sent = ''
            for word in self.nlp(sentence):
                if word.pos_ in ["NOUN"]:
                    fin_sent += word.text + ' '

        elif ((lib == "gensim")):

            #get nouns and adjectives longer than 1 char
            for word in self.nlp(text):
                if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                    sentence += word.text + ' '

            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence

        elif ((lib == "keybert")):

            new_lines = []

            for line in text_lines:
                new_line = []
                #get nouns and adjectives longer than 1 char
                for word in self.nlp(line):
                    if ((word.pos_ in ["NOUN", "ADJ"]) & (len(word.text) > 1)):
                        new_line.append(word.text)

                new_lines.append(" ".join(new_line))

            sentence = ". ".join(new_lines)

            #replace some words with others
            sentence = self.replace_dict(sentence)
            #remove specific words
            sentence = self.remove_common(sentence)
            #and lemmatize
            sentence = self.lemmatize(sentence)
            fin_sent = sentence
            #print("SENTENCE:",sentence)

        else:
            sys.exit("ERROR: LIB NOT FOUND: " + str(lib))

        return fin_sent

    ########
    #MYSQL##
    ########

    def update_company_tags(self,
                            first_tag,
                            idcomp,
                            weights,
                            second_tag=False,
                            other_tags=False):
        """Creates SQL Query for update the tag table
        Parameters
        -----------
        first_tag:  main tag
        idcomp: id of company 
        weights: weights of every tag
        second_tag: the second tag most relevant
        other_tags: rest of tags
        
        return: sql query to update
        
        """

        weights = self.get_weight_string(weights)

        print("\nID:", idcomp, "\nFIRST:", first_tag, "\nSECOND:", second_tag,
              "\nOTHERS:", other_tags, "\nWEIGHTS:", weights)

        if (other_tags):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '{2}', {5} = '{4}'  where ID = {3}".format(
                first_tag, second_tag, other_tags, idcomp, weights,
                self.weight_field)
        elif (second_tag):
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='{1}', OtherTags = '', {4} = '{3}'  where ID = {2}".format(
                first_tag, second_tag, idcomp, weights, self.weight_field)
        else:
            sql_upd = "UPDATE company set FirstTagID='{0}', SecondTagID='', OtherTags = '', {3} = '{2}' where ID = {1}".format(
                first_tag, idcomp, weights, self.weight_field)

        return (sql_upd)

    def check_and_insert_tag(self, eng_tag, tags):
        """Checks if tag exists in table tag and creates if not

        Parameters
        -----------
        eng_tag: Tag in english
        tags: Rest languages tags
        return: main tag

        """

        tag_compo = ""

        for i in tags:
            tag_compo += i + ";"

        try:
            sql_tag = "Select * from tag where ID = '{0}'".format(eng_tag)
            count = self.cursor_tag.execute(sql_tag)

            if (count == 0):  #not exists
                sql_tag = "Insert into tag (ID, Names) values ('{0}', '{1}') ".format(
                    eng_tag.lower().replace("'", ""),
                    tag_compo.lower().replace("'", ""))
                self.cursor_tag.execute(sql_tag)
                self.db.commit()

            return eng_tag

        except Exception as e:
            print("ERROR: check_and_insert_tag ", str(e))

    ##########
    ###NLP####
    ##########

    def detect_lang(self, text):
        """ Function that detects the language of a text
        
        Parameters
        -----------
        text:  Text to be detected
        return: lang detected
        """

        try:
            lang = self.translator.detect(text)[0]
            return lang
        except:
            print("WARNING: No language detected in text")
            return False

    def get_translation(self, text, lang="en"):
        """ Function that translate text to english
        
        Parameters
        -----------
        text:  Text to be translated
        return: translated text
        """

        max_len = 4900  #library limit 5000

        if (len(text) > max_len):

            sub_text = ""
            for i in range(0, math.ceil(len(text) / max_len)):
                start = i * max_len
                end = (i + 1) * (max_len)
                sub_text += text[
                    start:
                    end]  #translator.translate(text[start:end], lang_tgt='en')

            text = sub_text
        else:

            text = self.translator.translate(text, lang_tgt=lang)

            if (isinstance(text, list)):
                text = text[0].replace(",", ";")
            else:
                text = text.replace(",", ";")

        time.sleep(0.5)  #0.5 second delay in order to avoid ip blocking
        return text

    def translate_sentence_by_sentence(self, text):
        """ Function that translate sentece by sentence a string to english. Separated by '.'
        
        Parameters
        -----------
        text:  Text to be translated
        return: translated text
        """

        sub_text = ""
        sentences = text.split(".")

        for s in sentences:
            sub_text += self.translator.translate(s, lang_tgt='en')

        return sub_text

    def replace_dict(self, sentence):
        """ Function that replace words in a sentence according to a dictionary or words (replace_words)
        
        Parameters
        -----------
        sentence:  Text to be modified
        return: cleaned text
        """

        sentence = sentence.lower()  # convert to lower case

        for word, abbr in self.replace_words.items():
            sentence = sentence.replace(word.lower(), abbr)
        return sentence

    def remove_common(self, sentence):
        """ Function that remove words in a sentence according to a dictionary or words (remove_words)
        
        Parameters
        -----------
        sentence:  Text to be modified
        return: cleaned text
        """

        final_sentence = ""

        stops = [" ", ".", ",", "-", ";"]

        # common_words to remove

        for word in sentence.split(" "):
            tmp = word.lower()
            for i in stops:
                tmp = tmp.replace(i, "")
            if tmp not in self.remove_words:
                final_sentence += word.lower() + " "

        return final_sentence

    def lemmatize(self, sentence):
        """ Function that extract lemmas from sentence
        
        Parameters
        -----------
        sentence:  Text to be analysed
        return: transformed text
        """

        self.nlp.tokenizer = self.custom_tokenizer()
        final_sentence = ''

        # common_words to remove
        for word in self.nlp(sentence):
            final_sentence += word.lemma_.lower() + ' '
        return final_sentence

    def get_weight_string(self, weights):
        """ Function that transform weight object to string.
        
        Parameters
        -----------
        weights:  Weight object returned by nlp
        return: weight transformed to string
        """

        if (isinstance(weights, dict)):  #gensim
            weights = json.dumps(weights).replace("'", "")
        elif (isinstance(weights, list)):  #wordcloud
            weights = ', '.join(str(e).replace(",", ":")
                                for e in weights).replace("'", '"').replace(
                                    "(", '').replace(")", '')
            weights = "{" + weights + "}"

        return weights

    def custom_tokenizer(self):
        """ Function that defines a tokenizer in order to be used
        
        Parameters
        -----------
        nlp:  spacy loaded object
        return: prepared tokenizer
        """

        infixes = (
            LIST_ELLIPSES + LIST_ICONS + [
                r"(?<=[0-9])[+\-\*^](?=[0-9-])",
                r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
                    al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
                r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
                #r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
                r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
            ])

        infix_re = compile_infix_regex(infixes)

        return Tokenizer(self.nlp.vocab,
                         prefix_search=self.nlp.tokenizer.prefix_search,
                         suffix_search=self.nlp.tokenizer.suffix_search,
                         infix_finditer=infix_re.finditer,
                         token_match=self.nlp.tokenizer.token_match,
                         rules=self.nlp.Defaults.tokenizer_exceptions)

    def remove_special_characters(self, text):
        """ Function that removes special characters from a text
        
        Parameters
        -----------
        text:  text to be modified
        return: cleaned text
        """
        bad_chars = [';', ':', '!', "*", "¿", "?", "¡"]

        for i in bad_chars:
            text = text.replace(i, ' ')

        return text

    def remove_emojis(self, text):
        """ Function that removes emojis from a text
        
        Parameters
        -----------
        text:  text to be modified
        return: cleaned text
        """

        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)

    def emphasize_words(self, text):
        """ Function that repeats emphasize_words if found in text
            text: text to find words and modify.        
        """

        for i in self.words_to_emphasize:
            if i.lower() in text.lower():
                if (len(i) > 0):
                    for x in range(0, self.empha_multi):
                        text += ". " + i.lower()

        return text

    def get_keywords(self, words, amount=3, lib="wordcloud", sep=";"):
        """ Function that extract main keywords from processed text
        
        Parameters
        -----------
        words:  bag of words to extract tags
        amount: number of words to be extracted (3 words max for gensim)
        lib: lib to be used - gensim, wordcloud, keybert
        sep: separator for returned words 
        return: main tag, list with all tags, weighted tags
        
        """
        if (len(words) > 0):
            if (lib == "gensim"):

                tmp = keywords(words, words=min(amount, 3), split=True)
                info = keywords(words, words=min(amount, 3), scores=True)

                if (tmp):
                    return tmp[0], sep.join(tmp), info
                else:
                    return False
            elif (lib == "wordcloud"):
                listw = ""
                wcloud = wordcloud.WordCloud().generate(words)
                n = 0
                if (wcloud.words_):
                    for i in wcloud.words_:
                        if (n == 0):
                            main = i
                            listw += i + sep
                        else:
                            if (n < amount):
                                listw += i + sep
                        n += 1

                    return main, listw, wcloud.words_
                else:
                    return False

            elif (lib == "keybert"):

                tags = self.model.extract_keywords(words,
                                                   keyphrase_ngram_range=(1, 2),
                                                   stop_words='english',
                                                   use_mmr=True,
                                                   diversity=0.2,
                                                   top_n=amount)
                if (len(tags) > 0):
                    #KeyBERT returns (keyword, score) tuples; keep only the keyword strings
                    tag_words = [t[0] for t in tags]
                    return tag_words[0], sep.join(tag_words), ""
                else:
                    return "", "", ""

        else:
            #print("Warning: No words to extract tags: ", words)
            return False

    def strip_links(self, text):
        """ Removes urls from text
        
        Parameters
        -----------
        text: String to remove urls
        return: cleaned text
        
        """

        link_regex = re.compile(
            '((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)',
            re.DOTALL)
        links = re.findall(link_regex, text)
        for link in links:
            text = text.replace(link[0], ', ')
        return text

    def strip_all_entities(self, text):
        """ Removes rrss hastags and mentions from text
        
        Parameters
        -----------
        text: String to remove hashtags from
        return: cleaned text      
        
        """

        entity_prefixes = ['@', '#']
        for separator in string.punctuation:
            if separator not in entity_prefixes:
                text = text.replace(separator, ' ')
        words = []
        for word in text.split():
            word = word.strip()
            if word:
                if word[0] not in entity_prefixes:
                    words.append(word)
        return ' '.join(words)

    def get_first_text(self, obj):
        """Extracts from abn objet:
        - first occurrence if array
        - text if string

        Parameters
        -----------
        obj: object to extract text (array or str)
        return: first occurrence
       
        """

        if (isinstance(obj, list)):
            if (isinstance(obj[0], list)):
                return obj[0][0].strip()

            else:
                return obj[0].strip()
        else:
            return obj.strip()

    def remove_finaltags(self, tags):
        """Remove tags from final processing

        Parameters
        -----------
        tags: list to be cleaned
        return: cleaned tag list
       
        """

        tmp_list = []
        for i in tags:
            if i not in self.tags_toremove:
                tmp_list.append(i)
        return tmp_list

    def put_maintags(self, tags):
        """Pririze some tags as main tag

        Parameters
        -----------
        tags: list to be modified
        return: modified tag list
       
        """

        for i in self.tags_alwaysmain:
            if (i in tags):
                pos = (tags.index(i))
                tmp = tags[0]
                tags[pos] = tmp
                tags[0] = i
                return tags

        return tags
Example No. 18
        keywords = extractor.extract_keywords(current_input, pos_tags,
                                              window_length)
        keywords = keywords[:10]
    return keywords


default_doc = (
    "Compatibility of systems of linear constraints over the set of natural numbers\nCriteria of "
    "compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict "
    "inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms "
    "of construction of minimal generating sets of solutions for all types of systems are given. These "
    "criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can "
    "be used in solving all the considered types of systems and systems of mixed types. "
)

model = KeyBERT("distilbert-base-nli-mean-tokens")

st.title("Automatic Keyword Extraction")

st.markdown("<br>", unsafe_allow_html=True)
"""
[![Star](https://raw.githubusercontent.com/lasmedina/key-smith/master/GitHub-Mark-Light-32px.png)](https://github.com/lasmedina/key-smith)
"""

st.markdown(" 1. Paste your text below")
st.markdown(" 2. Select an extractor algorithm")
st.markdown(" 3. Et voilá!")

current_input = st.text_area(label="Input text:",
                             value=default_doc,
                             height=250)
Example No. 19
from keybert import KeyBERT

doc = """O aprendizado automático (português brasileiro) ou a aprendizagem automática (português europeu) ou também aprendizado de máquina (português brasileiro) ou aprendizagem de máquina (português europeu) (em inglês: machine learning) é um subcampo da Engenharia e da ciência da computação que evoluiu do estudo de reconhecimento de padrões e da teoria do aprendizado computacional em inteligência artificial[1]. Em 1959, Arthur Samuel definiu aprendizado de máquina como o "campo de estudo que dá aos computadores a habilidade de aprender sem serem explicitamente programados"[2](livre tradução). O aprendizado automático explora o estudo e construção de algoritmos que podem aprender de seus erros e fazer previsões sobre dados[3]. Tais algoritmos operam construindo um modelo a partir de inputs amostrais a fim de fazer previsões ou decisões guiadas pelos dados ao invés de simplesmente seguindo inflexíveis e estáticas instruções programadas. Enquanto que na inteligência artificial existem dois tipos de raciocínio (o indutivo, que extrai regras e padrões de grandes conjuntos de dados, e o dedutivo), o aprendizado de máquina só se preocupa com o indutivo."""

#paraphrase-xlm-r-multilingual-v1
#bert-base-multilingual-cased

model = KeyBERT('bert-base-multilingual-cased')
keywords = model.extract_keywords(doc)

model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)

model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)
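Since the document is Portuguese, one option (not part of the original snippet) is to pass Portuguese stop words explicitly; KeyBERT accepts a list for stop_words.

# Optional variant with Portuguese stop words from NLTK (assumes the corpus is downloaded)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
model.extract_keywords(doc,
                       keyphrase_ngram_range=(1, 2),
                       stop_words=stopwords.words('portuguese'))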
Example No. 20
import pytest
from .utils import get_test_data
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT

doc_one, doc_two = get_test_data()
model = KeyBERT(model="all-MiniLM-L6-v2")


@pytest.mark.parametrize("keyphrase_length", [(1, i + 1) for i in range(5)])
@pytest.mark.parametrize(
    "vectorizer", [None, CountVectorizer(ngram_range=(1, 1), stop_words="english")]
)
def test_single_doc(keyphrase_length, vectorizer):
    """Test whether the keywords are correctly extracted"""
    top_n = 5

    keywords = model.extract_keywords(
        doc_one,
        keyphrase_ngram_range=keyphrase_length,
        min_df=1,
        top_n=top_n,
        vectorizer=vectorizer,
    )

    assert isinstance(keywords, list)
    assert isinstance(keywords[0], tuple)
    assert isinstance(keywords[0][0], str)
    assert isinstance(keywords[0][1], float)
    assert len(keywords) == top_n
    for keyword in keywords:
Example No. 21
 def set_keyword_score_list(self, **kwargs):
     extractor = KeyBERT('distilbert-base-nli-mean-tokens')
     stop_words = kwargs.get('stop_words', 'english')
     self._keyword_score_list = extractor.extract_keywords(self._document, keyphrase_ngram_range=(1, 4), stop_words=stop_words)[:len(self._document)]
Example No. 22
parser.add_argument("--device",
                    type=str,
                    default=('0' if torch.cuda.is_available() else 'cpu'))
parser.add_argument(
    "--model_file",
    type=str,
    default="pretrained_models/conceptnet_pretrained_model.pickle")
parser.add_argument("--sampling_algorithm", type=str, default="beam-3")

args = parser.parse_args()

# %%
sentence_model = SentenceTransformer(
    "stsb-distilbert-base",
    device=('cuda:' + args.device if torch.cuda.is_available() else 'cpu'))
kb_model = KeyBERT(model=sentence_model)

bc = BertClient()
# start bert service command:
# bert-serving-start -model_dir ~/.bert-as-service/uncased_L-24_H-1024_A-16/ -num_worker=4
# or
# nohup bert-serving-start -model_dir ~/.bert-as-service/uncased_L-24_H-1024_A-16/ -num_worker=4 &

# %%
opt, state_dict = interactive.load_model_file(args.model_file)

data_loader, text_encoder = interactive.load_data("conceptnet", opt)

n_ctx = data_loader.max_e1 + data_loader.max_e2 + data_loader.max_r
n_vocab = len(text_encoder.encoder) + n_ctx

def initialization() -> tuple:
    parser = argparse.ArgumentParser(description='Test')
    parser.add_argument('-path_to_dict', type=str, default='TITLE.csv')
    parser.add_argument('-path_to_text', type=str, default='text.txt')
    parser.add_argument('-path_to_save', type=str, default='save.json')

    args = parser.parse_args()
    return args.path_to_dict, args.path_to_text, args.path_to_save


PATH_TO_DICT, PATH_TO_TEXT, PATH_TO_SAVE = initialization()
data = set(map(lambda word: word.lower(),
               list(pd.read_csv(PATH_TO_DICT).Word)))
model = KeyBERT('distilbert-base-nli-mean-tokens')
result = {}
key_words = set()


class FindThread(Thread):
    def __init__(self, word, sent_index, word_index):
        Thread.__init__(self)
        self.word = word
        self.sent_index = sent_index
        self.word_index = word_index

    def run(self):
        if self.word in data:
            update_dict(dict_=result,
                        key=self.word,
Example No. 24
def base_keybert():
    model = KeyBERT(model='distilbert-base-nli-mean-tokens')
    return model
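A short, assumed usage of the factory above; the document string is illustrative.

# Hypothetical caller of base_keybert()
kw_model = base_keybert()
print(kw_model.extract_keywords("Graph neural networks operate on node and edge features."))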
Example No. 25
    topics_tfidf_matrix = make_tfidf_matrix(topic_docs, topic_ids)
    tweets_tfidf_matrix = make_tfidf_matrix(tweet_docs, topic_ids)

    # named tuple for keyword representation in json
    Keyword = namedtuple("Keyword", ["keyword", "freq", "w_recall"])

    results_list = []

    # min_length and max_length to force the keywords to be just one word
    # Try with 3 words (it is said to be the optimal length)
    r = Rake(language="english", min_length=1, max_length=1)
    # n=1 to force kw to be one word | top=10 to get top 10 best ranked kw
    # Try with n=3
    yake = KeywordExtractor(lan="en", n=1, top=TOP_N_KEYWORDS)

    key_bert = KeyBERT('distilbert-base-nli-mean-tokens')

    iteration, total_iterations = 1, len(topic_ids)
    # Begin topic cycle
    for filename in os.listdir(NEWS_DIR):
        start = time.time()
        topic_id = filename.split(".")[0]
        with open(os.path.join(NEWS_DIR, filename), "r",
                  encoding="utf-8") as f:
            news = f.readlines()
        with open(os.path.join(TWEETS_DIR, filename), "r",
                  encoding="utf-8") as f:
            tweets = f.readlines()

        n_tweets = len(tweets)
        news = ''.join(news)
Example No. 26
 def train(self, documents, **kwargs):
     extractor = KeyBERT('distilbert-base-nli-mean-tokens')
     stop_words = kwargs.get('stop_words', 'english')
     self.the_total_keywords = extractor.extract_keywords(
         ' '.join(documents),
         keyphrase_ngram_range=(1, 5))[:self.total_keywords_in_training]
Example No. 27
from flask.wrappers import Response
from keybert import KeyBERT
from annoy import AnnoyIndex
from sentence_transformers import SentenceTransformer
import json
import fasttext
from torch.functional import tensordot
import pke
import uuid
import string
from nltk.corpus import stopwords
import os

app = Flask(__name__)

keyBERT_model = KeyBERT('distilbert-base-nli-mean-tokens')
BERT_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

embed_dim = 768
tree_Telecom = AnnoyIndex(embed_dim,'angular')
tree_Telecom.load('models/annoy/tree_Telecom.ann') # Department_of_Telecommunications

tree_IncomeTax = AnnoyIndex(embed_dim,'angular')# Central_Board_of_Direct_Taxes_(Income_Tax)
tree_IncomeTax.load('models/annoy/tree_IncomeTax.ann')

tree_Labour = AnnoyIndex(embed_dim,'angular')    # Ministry_of_labour_and_Employment
tree_Labour.load('models/annoy/tree_Labour.ann')

tree_Finance = AnnoyIndex(embed_dim,'angular')   # Department_of_Financial_Services_(Banking_Division)
tree_Finance.load('models/annoy/tree_Finance.ann')
 def __init__(self):
     self.model = KeyBERT('xlm-r-distilroberta-base-paraphrase-v1')
     self.textInput = ""
Example No. 29
from keybert import KeyBERT
import json
import scipy
import nltk
from sentence_transformers import SentenceTransformer
#Requires
#pip install -U sentence-transformers
#pip install keybert

nltk.download('stopwords')
vectorizer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
model = KeyBERT('distilbert-base-nli-mean-tokens')


def porterStem(paragraph):
    stem = nltk.stem.porter.PorterStemmer().stem
    data = []
    for word in paragraph.split():
        if (word in nltk.corpus.stopwords.words('english')):
            continue
        data.append(stem(word))

    return " ".join(data)


texts = []
titles = []
labels = []
dictLabels = {}

with open('../data/truth.jsonl') as file:
class KeyBertExtractor():
    """https://github.com/MaartenGr/KeyBERT"""

    #TODO there really are many many configs and I think changing these changes a great deal! see https://github.com/MaartenGr/KeyBERT and try out stuff!!
    #TODO there is a minimum-frequency-argument!! https://github.com/MaartenGr/KeyBERT/blob/master/keybert/_model.py#L83-L101
    #TODO does this use the phrase_in_text function? SHOULD IT?

    def __init__(self, is_multilan, faster=False, max_ngram=1):
        """available models: https://github.com/MaartenGr/KeyBERT#25-embedding-models"""
        from keybert import KeyBERT  #lazily loaded as it needs tensorflow which takes some time to init
        assert not (is_multilan and faster)
        if faster:
            self.model_name = "paraphrase-MiniLM-L6-v2"
        elif is_multilan:
            self.model_name = "paraphrase-multilingual-MiniLM-L12-v2"
        else:
            self.model_name = "paraphrase-mpnet-base-v2"
        print(f"Using model {self.model_name}")
        self.kw_model = KeyBERT(self.model_name)
        self.max_ngram = max_ngram

    def _fix_hyphenated(self, cand, comparedtext):
        # it may be the case that the candidate is something like "particle systems", however the text only has "many-particle systems".
        # if so, then `(not phrase_in_text(cand, without_stops)) and cand in without_stops == True`
        words_before_onset = comparedtext[:comparedtext.find(cand)].count(" ")
        chars_before_onset = len(" ".join(
            comparedtext.split(" ")[:words_before_onset]))
        if chars_before_onset > 0 and chars_before_onset + 1 != comparedtext.find(
                cand):
            # then the first word is hyphenated
            return comparedtext[chars_before_onset +
                                1:comparedtext.find(cand) + len(cand)]
        elif words_before_onset == 0 and bool(
                re.fullmatch(WORD_NUM_REGEX,
                             comparedtext[:comparedtext.find(cand)])):
            return comparedtext[:comparedtext.find(cand) + len(cand)]
        else:
            # then not the first word is hyphenated, but the last
            chars_after_hyphen = comparedtext[comparedtext.find(cand) +
                                              len(cand):].find(" ")
            if chars_after_hyphen > 0:
                return comparedtext[comparedtext.
                                    find(cand):comparedtext.find(cand) +
                                    len(cand) + chars_after_hyphen]
            elif re.fullmatch(
                    WORD_NUM_REGEX,
                    comparedtext[comparedtext.find(cand) + len(cand):]):
                return comparedtext[comparedtext.find(cand):]
        print("hm?!")
        return "NOPE"

    def extract_candidate(self, cand, text, without_stops, inds_without_stops,
                          only_words, inds_only_words):
        #TODO not sure if this version can also correct hyphenated stuff like the old one ARGH!!

        if (not phrase_in_text(cand, without_stops)) and cand in without_stops:
            cand = self._fix_hyphenated(cand, without_stops)
            if phrase_in_text(cand, text):  #maybe we're already done here
                return cand
            #now the cand is fixed and you can continue to checking phrase_in_text

        if phrase_in_text(cand, without_stops):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = without_stops.find(cand)
            start_ind = without_stops[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + without_stops[startpos:stoppos].count(" ")
            actual_phrase = " ".join(tokenized_with_stops[
                inds_without_stops[start_ind]:inds_without_stops[stop_ind] +
                1])
            if phrase_in_text(actual_phrase, text):
                if (actual_phrase.split(" ")[0] == cand.split(" ")[0]
                        and actual_phrase.split(" ")[-1] == cand.split(" ")[-1]):
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                print(f"Recovered `{actual_phrase}` does not start/end like the candidate `{cand}`")
                return
            print(f"Recovered `{actual_phrase}` is not a phrase in the original text")
            return

        if (not phrase_in_text(cand, only_words)) and cand in only_words:
            cand = self._fix_hyphenated(cand, only_words)
            #now the cand is fixed and you can continue to checking phrase_in_text

        if phrase_in_text(cand, only_words):
            tokenized_with_stops = tokenize_text(text, stopwords=None)[1]
            startpos = only_words.find(cand)
            start_ind = only_words[:startpos].count(" ")
            stoppos = startpos + len(cand)
            stop_ind = start_ind + only_words[startpos:stoppos].count(" ")
            actual_phrase = " ".join(tokenized_with_stops[
                inds_only_words[start_ind]:inds_only_words[stop_ind] + 1])
            if any(
                    i in actual_phrase[:-1] for i in list("?!") + ['"']
            ):  #if the phrase is not an actual phrase but split by punctuation
                print(
                    f"{cand} is not an actual phrase - in the text it is `{actual_phrase}`"
                )
                return None
            if phrase_in_text(actual_phrase, text):
                if (actual_phrase.split(" ")[0] == cand.split(" ")[0]
                        and actual_phrase.split(" ")[-1] == cand.split(" ")[-1]):
                    # print(f"FROM {cand} TO {actual_phrase}")
                    return actual_phrase
                print(f"Recovered `{actual_phrase}` does not start/end like the candidate `{cand}`")
                return
            print(f"Recovered `{actual_phrase}` is not a phrase in the original text")
            return

        if cand in without_stops:
            print("In without_stops")
            return

        if cand in only_words:
            print("in only_words")
            return

        #another thing: cand is "internship self organization", but in the text it's "internship self-organization". Maybe remove everything but letters and then re-apply?
        c2 = re.sub(re.compile(r'[\W\d]', re.U), "|", cand)
        t2 = re.sub(re.compile(r'[\W\d]', re.U), "|", text).lower()
        if c2 in t2:
            cand = text[t2.find(c2):t2.find(c2) + len(c2)]
            if phrase_in_text(cand, text):
                return cand
            else:
                print("whatever.")
        w2 = re.sub(re.compile(r'[\W\d]', re.U), "|", without_stops)
        if c2 in w2:
            cand = without_stops[w2.find(c2):w2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops,
                                          inds_without_stops, only_words,
                                          inds_only_words)
        o2 = re.sub(re.compile(r'[\W\d]', re.U), "|", only_words)
        if c2 in o2:
            cand = only_words[o2.find(c2):o2.find(c2) + len(c2)]
            return self.extract_candidate(cand, text, without_stops,
                                          inds_without_stops, only_words,
                                          inds_only_words)

        print(f"This does not work: {cand}")

    def __call__(self, text, lang="en"):  #TODO lang shouldn't be en!!!
        """see scripts/notebooks/proof_of_concept/proofofconcept_keyBERT.ipynb for why this is like this"""
        #TODO so extract_keywords can be passed a `vectorizer`, and that is by default Sklearn's CountVectorizer.
        # You can ALSO pass `candidates`, "to use instead of extracting them from the document(s)"!!!
        # Put a breakpoint in /home/chris/.local/lib/python3.8/site-packages/sklearn/feature_extraction/text.py:395 for details
        # TODO also why do I get this ^ warning ("Your stop_words may be inconsistent with your preprocessing") ??
        #    Does KeyBERT need already-preprocessed descriptions? If so, how much preprocessing, and how would I know?

        stopwords = get_stopwords(lang)
        candidates = set()
        for nwords in range(1, self.max_ngram + 1):  # range(1, max_ngram) would stop one ngram-length short
            n_candidates = self.kw_model.extract_keywords(
                text, keyphrase_ngram_range=(1, nwords), stop_words=stopwords)
            candidates |= set(i[0] for i in n_candidates)
        candidates = list(candidates)

        #TODO: what if there are special chars in the candidates? is everything ok then with the word-splitting?
        #TODO does this work for numbers?!
        inds_without_stops, without_stops = tokenize_text(text, stopwords)
        ind_word_list = [
            (ind, word) for ind, word in zip(inds_without_stops, without_stops)
            if WORD_NUM_REGEX.fullmatch(word)
        ]
        inds_only_words, only_words = list(zip(
            *ind_word_list)) if ind_word_list else ([], [])
        without_stops = " ".join(without_stops)
        only_words = " ".join(only_words)
        actual_keyphrases = []
        used_candidates = []
        n_immediateworking = n_fixed = n_errs = 0
        for cand in candidates:
            # if not all(WORD_REGEX.fullmatch(i) for i in cand.split(" ")):
            #     print(f"The candidate `{cand}` is not purely textual!")

            if phrase_in_text(cand, text):
                actual_keyphrases.append(cand)
                used_candidates.append(cand)
                n_immediateworking += 1
            else:
                intextcand = self.extract_candidate(cand, text, without_stops,
                                                    inds_without_stops,
                                                    only_words,
                                                    inds_only_words)
                #TODO if the candidate contains a number or similar, remove it and try again

                if intextcand:
                    if phrase_in_text(intextcand, text):
                        actual_keyphrases.append(intextcand)
                        used_candidates.append(cand)
                        n_fixed += 1
                        continue
                    else:
                        print(
                            "The extracted candidate is STILL not in the text!"
                        )
                n_errs += 1

        return actual_keyphrases, used_candidates, (n_immediateworking,
                                                    n_fixed, n_errs)
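

if __name__ == "__main__":
    # Hedged usage sketch (illustrative). KeyBertExtractor itself additionally
    # needs helpers from its surrounding module (phrase_in_text, tokenize_text,
    # get_stopwords, WORD_NUM_REGEX, re), which are not shown here, so only
    # KeyBERT's own API is exercised below.
    from sklearn.feature_extraction.text import CountVectorizer
    from keybert import KeyBERT

    doc = ("Keyword extraction with BERT embeddings works by comparing "
           "candidate phrases against the document embedding.")
    kw_model = KeyBERT("paraphrase-MiniLM-L6-v2")

    # Default extraction, as used inside KeyBertExtractor.__call__:
    print(kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words="english"))

    # Regarding the TODO in __call__: extract_keywords also accepts a custom
    # sklearn CountVectorizer and a precomputed `candidates` list.
    print(kw_model.extract_keywords(
        doc, vectorizer=CountVectorizer(ngram_range=(1, 2), stop_words="english")))
    print(kw_model.extract_keywords(doc, candidates=["keyword extraction", "bert embeddings"]))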