def createListOfFrequentTerms(df, column, max_ngram_size=2, numOfKeywords=40):
    # Concatenate all non-empty values of the column into one text
    # (assumes pandas is imported as pd for the NaN check).
    parts = []
    for val in df[column]:
        if pd.isna(val) or val == "":
            continue
        parts.append(str(val))
    text = ', '.join(parts)

    language = "en"
    deduplication_threshold = 0.9
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    sorted_list = []
    for kw in keywords:
        sorted_list.append(kw[0])
        print(kw)
    return sorted_list
def extract_keywords(self):
    content = self.get_cleantext(self.scrape_submission())
    # Extract the top five unigrams and top five bigrams separately.
    kwextractor1 = yake.KeywordExtractor(n=1)
    keywords1 = kwextractor1.extract_keywords(content)
    kwextractor2 = yake.KeywordExtractor(n=2)
    keywords2 = kwextractor2.extract_keywords(content)
    final = keywords1[:5] + keywords2[:5]
    return final
def extractKeyWords(transcript):
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    numOfKeywords = 20
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(transcript)
    return keywords
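# A hypothetical quick check for extractKeyWords above; the transcript text
# is invented for illustration and assumes `import yake` at module level.
transcript = ("The meeting covered keyword extraction, unsupervised "
              "learning, and the YAKE algorithm for ranking phrases.")
for kw, score in extractKeyWords(transcript):
    print(kw, score)  # lower scores indicate more relevant keywords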
def getKeywords(text):
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 5
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    output = []
    for key in keywords:
        output.append(key[0])
    return output
def searchKeywords(startString, keywordSearch, language, topNumber):
    ignored = set(["conseil communal", "conseil général"])
    kw_extractor = yake.KeywordExtractor(lan=language, top=topNumber)
    # Build the data path portably instead of hard-coding Windows separators.
    data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..", "rapport", "data", "txt")
    files = os.listdir(data_path)
    kwResult = []
    for f in sorted(files):
        if f.startswith(startString):
            print("try open " + f)
            text = open(os.path.join(data_path, f), encoding="latin-1").read()
            try:
                # YAKE returns (keyword, score) pairs.
                keywords = kw_extractor.extract_keywords(text)
                kept = []
                for kw, score in keywords:
                    words = kw.split()
                    if len(words) > 0 and kw not in ignored:
                        kept.append(kw)
                for k in kept:
                    for w in keywordSearch:
                        print(k + " - " + w)
                        if k.find(w) > -1:
                            kwResult.append(f)
                            print("add " + f)
            except Exception as ex:
                print("Impossible to extract keywords in " + f + " file:")
                print(ex)
    print(len(kwResult))
    return kwResult
def filterKeywords(results, raw_query):
    """
    Filter results by checking whether any of the query keywords appear in
    the article titles. The function is hardwired to the search-query
    formatting currently used.
    """
    # The actual query is everything before the first '(' in the raw query.
    actual_query = raw_query.partition('(')[0]
    words = actual_query.split()  # the query as a list of words
    if len(words) >= 4:
        # Keyword analysis. DISCLAIMER: this is slow.
        kw_extractor = yake.KeywordExtractor()
        keywords = kw_extractor.extract_keywords(actual_query)
        for kw in keywords:
            # Keep only single-word keywords; skip longer phrases.
            if len(kw[0].split()) == 1:
                words.append(kw[0])
    # Iterate over a copy so removing items does not skip elements.
    for i in list(results):
        pass_fail = any(word.lower() in i.get('name').lower() for word in words)
        if not pass_fail:
            results.remove(i)
    return results
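# A tiny, invented demonstration of filterKeywords above: the article dicts
# and the query string (including its parenthesized suffix) are illustrative
# assumptions, not real application data.
articles = [{"name": "Deep learning for keyword extraction"},
            {"name": "Gardening tips for spring"}]
filtered = filterKeywords(articles, "deep learning keyword extraction (filters:none)")
print(filtered)  # only titles containing one of the query words survive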
def getanalysis(df):
    lis = []
    language = "en"
    max_ngram_size = 1
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 2
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    for i in df["Description"]:
        if i == '':
            continue
        keywords = custom_kw_extractor.extract_keywords(i)
        temp = [kw for kw, score in keywords]
        lis.append(temp)
    return lis
def train(self, documents, **kwargs):
    """Unsupervised training of the keyword extractor on a list of documents.

    Arguments:
        documents {List[str]} -- documents to extract keywords from
    """
    # Load pre-computed index keywords as (keyword, score) pairs.
    with open('indexList.csv', newline='') as f:
        reader = csv.reader(f)
        index_keywords_ = list(reader)
    index_keywords = []
    for item in index_keywords_:
        a = (item[0], int(item[1]))
        index_keywords.append(a)
    total_data = ' '.join(documents)
    language = kwargs.get('language', 'en')
    max_ngram_size = self.n_gram
    deduplication_threshold = 0.4
    deduplication_algo = 'seqm'
    windowSize = 2
    numOfKeywords = self.total_keywords_in_training
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    self.the_total_keywords = index_keywords + custom_kw_extractor.extract_keywords(total_data)
def train(self, documents, **kwargs):
    """Unsupervised training of the keyword extractor on a list of documents.

    Arguments:
        documents {List[str]} -- documents to extract keywords from
    """
    total_data = ' '.join(documents)
    language = kwargs.get('language', 'en')
    max_ngram_size = self.n_gram
    deduplication_threshold = 0.7
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = self.total_keywords_in_training
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    self.the_total_keywords = custom_kw_extractor.extract_keywords(total_data)
def extract_keywords():
    workshops_file = 'all-workshops-2021-02-04.csv'
    workshops_df = get_workshops_df(workshops_file)
    # Show the prompt as a subheader above a selection box of workshop titles.
    st.subheader("Choose a workshop title to see the top three Yake extracted keywords from the workshop's description")
    title_list = workshops_df["title"].to_list()
    selected_workshop = st.selectbox("Select workshop", title_list)
    # Show the title of the chosen workshop.
    st.write(selected_workshop)
    # Find all workshops with the title selected above.
    filtered_workshops = workshops_df[workshops_df["title"] == selected_workshop]
    # Apply the custom YAKE function to the descriptions of those workshops.
    custom_kw_extractor = yake.KeywordExtractor(top=3)
    filtered_workshops["keywords_yake"] = filtered_workshops["body"].apply(get_top_three, args=(custom_kw_extractor,))
    # Format the workshop descriptions and their keywords into two columns.
    col_description, col_keywords = st.beta_columns(2)
    with col_description:
        st.write(filtered_workshops["body"])
    with col_keywords:
        st.write(filtered_workshops["keywords_yake"])
def find_keywords(text_in):
    # parameters
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'jaro'
    windowSize = 1
    numOfKeywords = 150
    # initialize the YAKE extractor with the parameters
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    # extract keywords based on the parameters
    keywords_output = custom_kw_extractor.extract_keywords(text_in)
    # create a dataframe of the keyword output
    keywords_df = pd.DataFrame(keywords_output, columns=["word", "score"])
    del keywords_df['score']
    # make a list of just the words (excluding the scores)
    keywords = keywords_df['word'].to_list()
    # since every bi/trigram already contains a unigram,
    # base the word search on unigrams only
    unigram_df = keywords_df[keywords_df['word'].apply(
        lambda x: len(x.split()) == 1)].copy()  # copy avoids SettingWithCopyWarning
    # add a column showing bi- and trigrams that include each unigram (for context)
    unigram_df['associated_phrases'] = unigram_df['word'].apply(
        word_in_list, args=(keywords,))
    return unigram_df
def extract_keywords(data):
    # YAKE returns nothing useful for punctuation-only inputs,
    # so they are replaced with a placeholder token below.
    special_inputs = [':)', '.']
    kw_extractor = yake.KeywordExtractor(n=n_gram)
    num_convs = len(list(data.keys()))
    print('Extracting keywords for all utterances of {} conversations ...'.format(num_convs))
    new_data = {}
    for c in tqdm(range(num_convs)):
        conv = list(data.values())[c]['content']
        new_conv = list(data.values())[c]
        num_utts = len(conv)
        for u in range(num_utts):
            utterance = conv[u]['message']
            if utterance in special_inputs:
                utterance = 'i'
            kws = kw_extractor.extract_keywords(utterance)
            kws_1 = get_kwd_1(kws)
            kws_2 = get_kwd_2(kws)
            kws_3 = get_kwd_3(kws)
            new_conv['content'][u]['keywords_1'] = kws_1
            new_conv['content'][u]['keywords_2'] = kws_2
            new_conv['content'][u]['keywords_3'] = kws_3
        new_data[list(data.keys())[c]] = new_conv
    print('Done!')
    return new_data
def get_keyword(docs):
    """
    Extract keywords from the given list of strings using YAKE.

    :param docs: strings to extract keywords from
    :return: a list of strings, where each string contains keywords separated by ','
    """
    # Parameters for the YAKE keyword extractor
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    numOfKeywords = 1000
    # Initialization
    list_of_keys = list()
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        top=numOfKeywords,
        features=None)
    # Iterate over each document and collect its keywords
    for each_article in docs:
        # extract_keywords returns (keyword, score) pairs; keep the keywords
        keywords = custom_kw_extractor.extract_keywords(each_article)
        temp1 = [kw for kw, score in keywords]
        list_of_keys.append(",".join(temp1))
    return list_of_keys
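# A small, invented example for get_keyword above: two short documents in,
# one comma-separated keyword string out per document.
docs = [
    "YAKE extracts keywords without any training corpus.",
    "Pandas makes tabular data analysis convenient in Python.",
]
for keys in get_keyword(docs):
    print(keys)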
def add_metadata(text):
    print("| **** Extracting metadata from text: Done")
    language = "pt"
    max_ngram_size = 1
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 4
    #######################################################################
    # Extract four keywords from the metadata content
    #######################################################################
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    print("| **** Adding Metadata to instances: Done")
    return keywords
def yake_keyword_extraction(
        text: str,
        parameters: Dict = parameters) -> List[Tuple[str, float]]:
    # `parameters` defaults to the module-level configuration dict and is
    # unpacked straight into yake.KeywordExtractor.
    custom_kw_extractor = yake.KeywordExtractor(**parameters)
    keywords = custom_kw_extractor.extract_keywords(text)
    return keywords
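# A minimal usage sketch for yake_keyword_extraction above. The keys in this
# dict are real yake.KeywordExtractor keyword arguments; the specific values
# are illustrative assumptions, not the module's actual defaults.
example_parameters = {"lan": "en", "n": 2, "dedupLim": 0.9, "top": 10}
sample = "YAKE is a light-weight unsupervised keyword extraction method."
for keyword, score in yake_keyword_extraction(sample, example_parameters):
    print(keyword, score)  # lower scores indicate more relevant keywords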
def yake_keywords(
        texts: List[str],
        language: str = "English",
        max_len: int = 1,
        progress_callback: Callable = None) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using YAKE!.

    Parameters
    ----------
    texts : list
        List of documents.
    language : str
        Selected language.
    max_len : int
        Maximum number of tokens.
    progress_callback : callable
        Function for reporting progress.

    Returns
    -------
    keywords : list
    """
    if progress_callback is None:
        progress_callback = dummy_callback
    language = YAKE_LANGUAGE_MAPPING[language]
    extractor = yake.KeywordExtractor(lan=language, n=max_len)
    keywords = []
    n_docs = len(texts)
    for i, text in enumerate(texts):
        progress_callback(i / n_docs)
        keywords.append(extractor.extract_keywords(text))
    return keywords
def __init__(self,
             path: str,
             language: str = "en",
             max_ngram_size: int = 3,
             numOfKeywords: int = 50,
             deduplication_threshold: float = 0.9,
             deduplication_algo: str = 'seqm',
             windowSize=1,
             k=3):
    self.language = language
    self.max_ngram_size = max_ngram_size
    self.numOfKeywords = numOfKeywords
    self.deduplication_threshold = deduplication_threshold
    self.deduplication_algo = deduplication_algo
    self.windowSize = windowSize
    self.custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    self.nlp = spacy.load("en_core_web_lg")
    self.vec_len = len(self.nlp("cosine").vector)
    self.k = k
    self.text = open(path, "r", encoding='UTF-8').read()
def yake_keyword(doc):
    """
    Extracts keywords from the given text using YAKE.

    Args:
        doc: paragraph from which keywords need to be extracted.

    Returns:
        Keywords extracted from the text document passed.
    """
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 20
    extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = extractor.extract_keywords(doc)
    # YAKE returns (keyword, score) pairs; keep only the keyword strings.
    keywords = [word for word, score in keywords]
    return keywords
def run_yake(path, key_phrase_dict, no_of_words):
    print('running Yake')
    text = open(path, 'r').read()
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = no_of_words
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text)
    key_phrase_dict['YAKE'] = [word[0] for word in keywords]
    return key_phrase_dict
def keywords1(texts_2):
    global text4, keywords_yake
    keywords_yake = []
    text4 = str(texts_2)
    # Using Yake
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.9
    deduplication_algo = 'seqm'
    windowSize = 1
    numOfKeywords = 20
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=windowSize,
        top=numOfKeywords,
        features=None)
    keywords = custom_kw_extractor.extract_keywords(text4)
    for kw in keywords:
        print(kw)
        keywords_yake.append(kw[0])
    return keywords_yake
def test_phraseless_example():
    text_content = "- not yet"
    pyake = yake.KeywordExtractor()
    result = pyake.extract_keywords(text_content)
    assert len(result) == 0
def test_n3_PT():
    text_content = '''"Conta-me Histórias." Xutos inspiram projeto premiado. A plataforma "Conta-me Histórias" foi distinguida com o Prémio Arquivo.pt, atribuído a trabalhos inovadores de investigação ou aplicação de recursos preservados da Web, através dos serviços de pesquisa e acesso disponibilizados publicamente pelo Arquivo.pt . Nesta plataforma em desenvolvimento, o utilizador pode pesquisar sobre qualquer tema e ainda executar alguns exemplos predefinidos. Como forma de garantir a pluralidade e diversidade de fontes de informação, esta são utilizadas 24 fontes de notícias eletrónicas, incluindo a TSF. Uma versão experimental (beta) do "Conta-me Histórias" está disponível aqui. A plataforma foi desenvolvida por Ricardo Campos investigador do LIAAD do INESC TEC e docente do Instituto Politécnico de Tomar, Arian Pasquali e Vitor Mangaravite, também investigadores do LIAAD do INESC TEC, Alípio Jorge, coordenador do LIAAD do INESC TEC e docente na Faculdade de Ciências da Universidade do Porto, e Adam Jatwot docente da Universidade de Kyoto.'''

    pyake = yake.KeywordExtractor(lan="pt", n=3)
    result = pyake.extract_keywords(text_content)
    res = [('Conta-me Histórias', 0.006225012963810038),
           ('LIAAD do INESC', 0.01899063587015275),
           ('INESC TEC', 0.01995432290332246),
           ('Conta-me', 0.04513273690417472),
           ('Histórias', 0.04513273690417472),
           ('Prémio Arquivo.pt', 0.05749361520927859),
           ('LIAAD', 0.07738867367929901),
           ('INESC', 0.07738867367929901),
           ('TEC', 0.08109398065524037),
           ('Xutos inspiram projeto', 0.08720742489353424),
           ('inspiram projeto premiado', 0.08720742489353424),
           ('Adam Jatwot docente', 0.09407053486771558),
           ('Arquivo.pt', 0.10261392141666957),
           ('Alípio Jorge', 0.12190479662535166),
           ('Ciências da Universidade', 0.12368384021490342),
           ('Ricardo Campos investigador', 0.12789997272332762),
           ('Politécnico de Tomar', 0.13323587141127738),
           ('Arian Pasquali', 0.13323587141127738),
           ('Vitor Mangaravite', 0.13323587141127738),
           ('preservados da Web', 0.13596322680882506)]
    assert result == res

    keywords = [kw[0] for kw in result]
    th = TextHighlighter(max_ngram_size=3)
    textHighlighted = th.highlight(text_content, keywords)
    print(textHighlighted)
    assert textHighlighted == '"<kw>Conta-me Histórias</kw>." <kw>Xutos inspiram projeto</kw> premiado. A plataforma "<kw>Conta-me Histórias</kw>" foi distinguida com o <kw>Prémio Arquivo.pt</kw>, atribuído a trabalhos inovadores de investigação ou aplicação de recursos <kw>preservados da Web</kw>, através dos serviços de pesquisa e acesso disponibilizados publicamente pelo <kw>Arquivo.pt</kw> . Nesta plataforma em desenvolvimento, o utilizador pode pesquisar sobre qualquer tema e ainda executar alguns exemplos predefinidos. Como forma de garantir a pluralidade e diversidade de fontes de informação, esta são utilizadas 24 fontes de notícias eletrónicas, incluindo a TSF. Uma versão experimental (beta) do "<kw>Conta-me Histórias</kw>" está disponível aqui. A plataforma foi desenvolvida por <kw>Ricardo Campos investigador</kw> do <kw>LIAAD do INESC</kw> <kw>TEC</kw> e docente do Instituto <kw>Politécnico de Tomar</kw>, <kw>Arian Pasquali</kw> e <kw>Vitor Mangaravite</kw>, também investigadores do <kw>LIAAD do INESC</kw> <kw>TEC</kw>, <kw>Alípio Jorge</kw>, coordenador do <kw>LIAAD do INESC</kw> <kw>TEC</kw> e docente na Faculdade de <kw>Ciências da Universidade</kw> do Porto, e <kw>Adam Jatwot docente</kw> da Universidade de Kyoto.'
def test_n3_EN():
    text_content = '''Google is acquiring data science community Kaggle. Sources tell us that Google is acquiring Kaggle, a platform that hosts data science and machine learning competitions. Details about the transaction remain somewhat vague , but given that Google is hosting its Cloud Next conference in San Francisco this week, the official announcement could come as early as tomorrow. Reached by phone, Kaggle co-founder CEO Anthony Goldbloom declined to deny that the acquisition is happening. Google itself declined 'to comment on rumors'. Kaggle, which has about half a million data scientists on its platform, was founded by Goldbloom and Ben Hamner in 2010. The service got an early start and even though it has a few competitors like DrivenData, TopCoder and HackerRank, it has managed to stay well ahead of them by focusing on its specific niche. The service is basically the de facto home for running data science and machine learning competitions. With Kaggle, Google is buying one of the largest and most active communities for data scientists - and with that, it will get increased mindshare in this community, too (though it already has plenty of that thanks to Tensorflow and other projects). Kaggle has a bit of a history with Google, too, but that's pretty recent. Earlier this month, Google and Kaggle teamed up to host a $100,000 machine learning competition around classifying YouTube videos. That competition had some deep integrations with the Google Cloud Platform, too. Our understanding is that Google will keep the service running - likely under its current name. While the acquisition is probably more about Kaggle's community than technology, Kaggle did build some interesting tools for hosting its competition and 'kernels', too. On Kaggle, kernels are basically the source code for analyzing data sets and developers can share this code on the platform (the company previously called them 'scripts'). Like similar competition-centric sites, Kaggle also runs a job board, too. It's unclear what Google will do with that part of the service. According to Crunchbase, Kaggle raised $12.5 million (though PitchBook says it's $12.75) since its launch in 2010.
Investors in Kaggle include Index Ventures, SV Angel, Max Levchin, Naval Ravikant, Google chief economist Hal Varian, Khosla Ventures and Yuri Milner'''

    pyake = yake.KeywordExtractor(lan="en", n=3)
    result = pyake.extract_keywords(text_content)
    print(result)
    res = [('Google', 0.02509259635302287),
           ('Kaggle', 0.027297150442917317),
           ('CEO Anthony Goldbloom', 0.04834891465259988),
           ('data science', 0.05499112888517541),
           ('acquiring data science', 0.06029572445726576),
           ('Google Cloud Platform', 0.07461585862381104),
           ('data', 0.07999958986489127),
           ('San Francisco', 0.0913829662674319),
           ('Anthony Goldbloom declined', 0.09740885820462175),
           ('science', 0.09834167930168546),
           ('science community Kaggle', 0.1014394718805728),
           ('machine learning', 0.10754988562466912),
           ('Google Cloud', 0.1136787749431024),
           ('Google is acquiring', 0.114683257931042),
           ('acquiring Kaggle', 0.12012386507741751),
           ('Anthony Goldbloom', 0.1213027418574554),
           ('platform', 0.12404419723925647),
           ('co-founder CEO Anthony', 0.12411964553586782),
           ('CEO Anthony', 0.12462950727635251),
           ('service', 0.1316357590449064)]
    assert result == res

    keywords = [kw[0] for kw in result]
    th = TextHighlighter(max_ngram_size=3)
    textHighlighted = th.highlight(text_content, keywords)
    print(textHighlighted)
    assert textHighlighted == "<kw>Google</kw> is acquiring <kw>data science</kw> community <kw>Kaggle</kw>. Sources tell us that <kw>Google</kw> is acquiring <kw>Kaggle</kw>, a <kw>platform</kw> that hosts <kw>data science</kw> and <kw>machine learning</kw> competitions. Details about the transaction remain somewhat vague , but given that <kw>Google</kw> is hosting its Cloud Next conference in <kw>San Francisco</kw> this week, the official announcement could come as early as tomorrow. Reached by phone, <kw>Kaggle</kw> co-founder <kw>CEO Anthony Goldbloom</kw> declined to deny that the acquisition is happening. <kw>Google</kw> itself declined 'to comment on rumors'. <kw>Kaggle</kw>, which has about half a million <kw>data</kw> scientists on its <kw>platform</kw>, was founded by Goldbloom and Ben Hamner in 2010. The <kw>service</kw> got an early start and even though it has a few competitors like DrivenData, TopCoder and HackerRank, it has managed to stay well ahead of them by focusing on its specific niche. The <kw>service</kw> is basically the de facto home for running <kw>data science</kw> and <kw>machine learning</kw> competitions. With <kw>Kaggle</kw>, <kw>Google</kw> is buying one of the largest and most active communities for <kw>data</kw> scientists - and with that, it will get increased mindshare in this community, too (though it already has plenty of that thanks to Tensorflow and other projects). <kw>Kaggle</kw> has a bit of a history with <kw>Google</kw>, too, but that's pretty recent. Earlier this month, <kw>Google</kw> and <kw>Kaggle</kw> teamed up to host a $100,000 <kw>machine learning</kw> competition around classifying YouTube videos. That competition had some deep integrations with the <kw>Google</kw> Cloud <kw>Platform</kw>, too. Our understanding is that <kw>Google</kw> will keep the <kw>service</kw> running - likely under its current name. While the acquisition is probably more about Kaggle's community than technology, <kw>Kaggle</kw> did build some interesting tools for hosting its competition and 'kernels', too. On <kw>Kaggle</kw>, kernels are basically the source code for analyzing <kw>data</kw> sets and developers can share this code on the <kw>platform</kw> (the company previously called them 'scripts'). \nLike similar competition-centric sites, <kw>Kaggle</kw> also runs a job board, too. It's unclear what <kw>Google</kw> will do with that part of the <kw>service</kw>. According to Crunchbase, <kw>Kaggle</kw> raised $12.5 million (though PitchBook says it's $12.75) since its launch in 2010. Investors in <kw>Kaggle</kw> include Index Ventures, SV Angel, Max Levchin, Naval Ravikant, <kw>Google</kw> chief economist Hal Varian, Khosla Ventures and Yuri Milner"
def get_keywords(text):
    # TODO: experiment to find best parameters
    """Extracts keywords from given text.

    Extracts keywords from the given text using the YAKE keyword extraction
    algorithm. Currently uses basic parameters, which can be optimized.
    For an explanation of the parameters, see the YAKE documentation.

    :type text: str
    :param text: text to be processed, most likely created by
        get_text_for_character() or get_text_for_chapter()
    :return: extracted keywords as tuples including confidence as float,
        e.g. ('keyword', 0.042)
    :rtype: list
    """
    max_ngram_size = 1
    deduplication_threshold = 0.9
    # "seqm" (SequenceMatcher) is presumably what was meant here; unknown
    # names silently fall back to YAKE's default deduplication function.
    deduplication_algo = "seqm"
    window_size = 1
    num_of_keywords = 20
    kw_extractor = yake.KeywordExtractor(
        lan="en",
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        dedupFunc=deduplication_algo,
        windowsSize=window_size,
        top=num_of_keywords,
        features=None,
    )
    # FIXME: stopwords don't seem to be working
    keywords = kw_extractor.extract_keywords(text)
    return keywords
def test_null_and_blank_example():
    pyake = yake.KeywordExtractor()

    result = pyake.extract_keywords("")
    assert len(result) == 0

    result = pyake.extract_keywords(None)
    assert len(result) == 0
def test_extraction_with_YAKE():
    yake_extractor = yake.KeywordExtractor(lan="el", top=5)
    while True:
        input_doc = input()
        if input_doc == "end":
            break
        output = extract_keywords_YAKE(yake_extractor, input_doc)
        print(output)
def keywordService(text):
    kw_extractor = yake.KeywordExtractor()
    keywords = kw_extractor.extract_keywords(text)
    keyword_priority_list = []
    for kw in keywords:
        keyword_priority_list.append(kw)
    return keyword_priority_list
def compute_keywords(df):
    import yake
    n_keywords = 2
    custom_kw_extractor = yake.KeywordExtractor(
        lan="en", n=2, dedupLim=0.9, top=n_keywords, features=None)
    keywords = {}
    # Extract keywords from the concatenated text of each cluster.
    for i, dx in df.groupby(df["cluster"]):
        text = "\n".join(dx["text"].values)
        kw = [x[0] for x in custom_kw_extractor.extract_keywords(text)]
        keywords[i] = "; ".join(kw)
    return keywords
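# A minimal, invented usage sketch for compute_keywords above; the "cluster"
# and "text" column names come from the function, the rows are made up.
import pandas as pd

df = pd.DataFrame({
    "cluster": [0, 0, 1],
    "text": [
        "Keyword extraction finds the most relevant terms in a document.",
        "YAKE is an unsupervised keyword extraction method.",
        "Clustering groups similar documents before keyword extraction.",
    ],
})
print(compute_keywords(df))  # e.g. {0: 'keyword extraction; ...', 1: '...'}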
def extract_tags(text):
    simple_kwextractor = yake.KeywordExtractor()
    # extract_keywords returns (keyword, score) pairs, already deduplicated
    # and sorted best-first, so take the top two directly.
    post_keywords = simple_kwextractor.extract_keywords(text)
    sentence_output = ""
    for word, number in post_keywords[:2]:
        sentence_output += word + " "
    return sentence_output
def __init__(self, max_ngram_size=3, window_size=1, **kwargs):
    super().__init__(**kwargs)
    self.name = kwargs.get('name', 'Yake')
    self.max_ngram_size = max_ngram_size
    self.window_size = window_size
    self.kw_extractor = yake.KeywordExtractor(
        n=self.max_ngram_size,
        windowsSize=self.window_size,
    )