# Assumes module-level state: page_content, word_count, crawler_tuple (dicts)
# and a local `process` module providing the text-cleaning helpers used below.
import re
from urllib.request import urlopen
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup


def scrape(visited, vocab):
    i = 0
    for url in visited:
        try:
            response = urlopen("http://" + url)
            print("scraping", i)
            i += 1
        except Exception:
            continue
        # Netloc of the page being scraped, used to keep crawling on-site
        # (the original iterated over the characters of the URL string).
        base = [urlparse(url).netloc]
        bs = BeautifulSoup(response, 'html.parser')
        try:
            title = bs.find('title').text
        except Exception:
            continue
        tags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div']
        if title:
            content = title
        else:
            content = ''
        for tag in tags:
            text_tag = bs.find_all(tag)
            textContent = [x.text for x in text_tag]
            content += ' '.join(textContent)
        page_content[url] = {'data': content}
        content = re.sub('\n', ' ', content)
        # Clean the page text: tokenize, drop stopwords, stem, filter by length.
        tokens = process.tokenizer_fun(content)
        cleaned = process.remove_stopwords(tokens)
        stemmed = process.stemming(cleaned)
        cleaned2 = process.remove_stopwords(stemmed)
        cleaned_text = process.length2(cleaned2)
        word_count[url] = {}
        v_flag = True
        for token in cleaned_text:
            if token not in vocab:
                vocab[token] = 1
            elif v_flag:
                vocab[token] += 1
                v_flag = False
            if token in word_count[url]:
                word_count[url][token] += 1
            else:
                word_count[url][token] = 1
        links = [urljoin(url, l.get('href')) for l in bs.findAll('a')]
        links = [l.rstrip("/") for l in links if urlparse(l).netloc in base]
        finalData = (url, cleaned_text, list(set(links)))
        if finalData != (-1):
            crawler_tuple[url] = finalData
    return crawler_tuple
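# A minimal usage sketch for scrape() (not part of the original code): the seed
# URLs are hypothetical, and the module-level dicts are assumed to start empty.
page_content, word_count, crawler_tuple = {}, {}, {}
vocab = {}
seeds = ["example.com", "example.com/about"]
for url, tokens, links in scrape(seeds, vocab).values():
    print(url, len(tokens), "tokens,", len(links), "on-site links")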
def one_classifier(text, lang, embedding_name, model_path, model_file):
    # ----------------------------------------------------------------------
    # --- LOAD MODEL AND EMBEDDING
    # ----------------------------------------------------------------------
    print(model_file)
    cls = pickle.load(open(model_path + model_file, 'rb'))
    embedding = Embeddings(embedding_name)

    # ----------------------------------------------------------------------
    # --- PROCESSING
    # ----------------------------------------------------------------------
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob = cls.predict_proba(vectorized_text2)[:, 1]
    else:
        vectorized_text = np.zeros((300,))
        prob = [0]  # keep prob iterable so the return loop below still works
    # print(cls.classes_)  # check that the class at the second position is L1
    for i in list(prob):
        return i
def processTitle(title):
    # print('Title before', title)
    title = title.lower()
    title = tokenise(title)
    title = remove_stopwords(title)
    title = stem(title)
    # print('Title: ', title)
    return title
def processBody(text):
    # print('Body: ', text)
    data = re.sub(r'\{\{.*\}\}', r' ', text)  # strip {{...}} template markup
    data = tokenise(data)
    data = remove_stopwords(data)
    data = stem(data)
    # print('Body: ', data)
    return data
def processLinks(text):
    data = text.split('\n')
    links = []
    for line in data:
        # External-link lines in wiki markup start with "* [".
        if re.match(r'\*[\ ]*\[', line):
            links.append(line)
    data = tokenise(' '.join(links))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Links: ', data)
    return data
def processCategories(text):
    data = text.split('\n')
    categories = []
    for line in data:
        if re.match(r'\[\[category', line):
            categories.append(re.sub(r'\[\[category:(.*)\]\]', r'\1', line))
    data = tokenise(' '.join(categories))
    data = remove_stopwords(data)
    data = stem(data)
    # print('Categories: ', data)
    return data
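# A small usage sketch for the field processors above (not part of the original
# code). The sample markup and the exact output of tokenise/remove_stopwords/stem
# are assumptions; this only shows how a wiki page is split into fields.
sample_text = (
    "{{infobox programming language\n"
    "name = Python\n"
    "}}\n"
    "Python is a widely used programming language.\n"
    "* [http://python.org official site]\n"
    "[[category:programming languages]]\n"
)
title_terms = processTitle("Python (programming language)")
body_terms = processBody(sample_text)
link_terms = processLinks(sample_text)
category_terms = processCategories(sample_text)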
def full_preprocessing(self):
    """General preprocessing of a document sample.

    This method removes punctuation ('.' and ',' are kept), lowercases the
    text, removes English stop words, tokenizes into sentences and words,
    and builds a list of word-tokenized sentences.
    """
    self.text = pre.remove_punctuation(self.text)
    self.text = pre.to_lowercase(self.text)
    self.words = pre.tokenize_to_words(self.text)
    self.words = pre.remove_stopwords(self.words)
    self.text = ' '.join(self.words)
    self.sentences = pre.tokenize_to_sentences(self.text)
    self.normalized_sample = [pre.tokenize_to_words(sent) for sent in self.sentences]
    return self.sentences
def __init__(self, docs, num_clu):
    self.no_clusters = num_clu
    # self.sentences = preprocessing.load_sentences(docs)
    self.sentences = preprocessing.load_duc_xml(docs)
    self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
    # self.full_doc = helper.fulldoc(self.sentences)
    # self.sent_no_swords.append(self.full_doc)
    self.unique_terms = helper.uniqueterms(self.sent_no_swords)
    self.sent_weight = helper.tfisf(self.sent_no_swords, self.unique_terms)
    # self.sent_weight = helper.word_vector(self.sent_no_swords, self.unique_terms)
    self.sent_similarity = helper.similarity(self.sent_weight, self.sent_weight)
    self.clusters = cluster.kmedoid(self.sent_similarity, self.no_clusters)
def tokenize_corpus(corpus_file):
    raw_document_ko = read_txt(corpus_file)
    lines = raw_document_ko.split('\n')
    # Clean the Korean corpus: strip extraneous characters and English text,
    # then drop stopwords.
    processed_doc_ko = remove_stopwords(
        remove_extraneous(remove_english(remove_extraneous(lines))))
    doc_ko = ' '.join(str(word) for line in processed_doc_ko for word in line)
    t = Okt()  # KoNLPy's Okt morphological analyzer
    tokens_ko = t.morphs(doc_ko)
    return tokens_ko
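# A minimal usage sketch (the corpus file name is an assumption): tokenize a
# Korean text file into morphemes after the cleaning steps above.
tokens_ko = tokenize_corpus('corpus_ko.txt')
print(tokens_ko[:20])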
def make_vectorize():
    try:
        # Load the data
        data = request.get_json()
    except Exception as e:
        raise e

    if data == {}:
        return bad_request()

    # Get the text and the language
    try:
        lang = data['lang']
    except Exception:
        try:
            lang = detect_language(data['text'])
            print(lang)
        except Exception:
            responses = jsonify("Error in vectorize: language field is missing")
            return responses
    try:
        text = data['text']
    except Exception:
        responses = jsonify("Error in vectorize: text is missing")
        return responses
    if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
        responses = jsonify(
            "Language not available. Language must be in ['en','es','ar','ro','fr']")
        return responses

    # Preprocess the text
    print("Vectorize...")
    embeddings = Embeddings(emb_dict[lang])
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embeddings,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros((300,))
    print(vectorized_text)

    # Send the response codes
    responses = jsonify(vector=vectorized_text.tolist())
    responses.status_code = 200
    return responses
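# Hypothetical usage sketch: the Flask route registration is not shown above,
# so the endpoint path '/vectorize' and host are assumptions. The JSON body
# matches what make_vectorize() reads: 'text' plus an optional 'lang'.
import requests

resp = requests.post('http://localhost:5000/vectorize',
                     json={'text': 'El gato duerme.', 'lang': 'es'})
print(resp.json()['vector'][:5])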
def __init__(self, docs, num_clu):
    self.no_clusters = num_clu
    print("Loading Sentences...")
    self.sentences = preprocessing.load_sentences(docs)
    print("Preprocessing...")
    self.sent_no_swords = preprocessing.remove_stopwords(self.sentences)
    self.unique_terms = helper.uniqueterms(self.sent_no_swords)
    self.sent_weight = helper.tfisf(self.sent_no_swords, self.unique_terms)
    # self.sent_weight = helper.word_vector(self.sent_no_swords, self.unique_terms)
    print("Finding Similarity Graph...")
    self.sent_similarity = helper.similarity(self.sent_weight, self.sent_weight)
    print("Clustering...")
    self.clusters = cluster.kmedoid(self.sent_similarity, self.no_clusters)
def get_topic_keywords(qnas, embedding_model=None):
    """Define the topic keywords.

    Args:
        qnas (list): List of questions and answers.
        embedding_model (wordembedding.WordEmbedding): Word Embedding model.

    Returns:
        set: Topic keywords.
    """
    answers_kwords = list()
    questions_kwords = list()
    similar_answers_kwords = list()
    similar_questions_kwords = list()

    for question, answer in qnas:
        question_kwords = list()
        # Obtain keywords
        aux = re.sub(r'((\*~\d+)|(\[.*?\]))', ' ', question)
        # Split on two spaces so that multi-word phrases stay together
        for q in aux.split('  '):
            word = q.strip()
            if ' ' in word:
                question_kwords.append('\"{}\"'.format(word))
            elif word:
                question_kwords.append(word)
        questions_kwords.extend(question_kwords)

        answer_no_sw = preprocessing.remove_stopwords(answer)
        answers_kwords.extend(find_keywords.find_entities(answer_no_sw))

    if embedding_model is not None:
        for word in questions_kwords:
            word_similars = embedding_model.get_similar(word, top_n=2)
            similar_questions_kwords.extend(word_similars)
        for word in answers_kwords:
            word_similars = embedding_model.get_similar(word, top_n=2)
            similar_answers_kwords.extend(word_similars)

    result = set(questions_kwords + similar_questions_kwords
                 # + answers_kwords + similar_answers_kwords
                 )
    return result
def analyze(text, lang, registry):
    topics_path = registry['topics']["topics_path"]
    patterns_path = registry["key_ideas"]["patterns_path"]

    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)

    tagger = Tagger(lang, registry['pos_models'])
    pos = tagger.pos_tag(processed_text)

    concepts = get_concepts(pos, lang)
    key_ideas = get_key_ideas(pos, lang, patterns_path)
    topics = get_topics(no_stpw_text, lang, topics_path)

    result = [concepts, key_ideas, topics]
    # return {"concepts": concepts, "key_ideas": key_ideas, "topics": topics}
    return result
def processInfo(text):
    data = text.split('\n')
    flag = -1
    info = []
    st = "}}"
    for line in data:
        if re.match(r'\{\{infobox', line):
            info.append(re.sub(r'\{\{infobox(.*)', r'\1', line))
            flag = 0
        elif flag == 0:
            if line == st:
                flag = -1
                continue
            info.append(line)
    data = tokenise(' '.join(info))
    data = remove_stopwords(data)
    data = stem(data)
    # print("Info: ", data)
    return data
def process_data(
    self, data_id: int, data: str, add_document: bool
) -> Optional[dict[str, float]]:
    """
    Preprocesses and processes a document.

    :param data_id: The document's ID.
    :param data: The content of the document.
    :param add_document: Whether the document should immediately be added to
        the TF.IDF collection.
    :return: The TF.IDF scores of each term in the document, unless the
        document was added to the collection. In that case nothing is
        returned.
    """
    preprocessed = remove_stopwords(lemmatize(split_text(data)))
    if not add_document:
        return self.tfidf.process_document(preprocessed)
    else:
        index = self.tfidf.add_document(preprocessed)
        self.data_ids[data_id] = index
def create_pre_calculated_result_csv(start, end, csv_name):
    essays_scores = pd.read_csv('essays_and_scores.csv', encoding="ISO-8859-1")
    essays_scores = essays_scores.iloc[start:end, :]
    essays = essays_scores['essay'].values
    scores1 = essays_scores['rater1_domain1'].values
    scores2 = essays_scores['rater2_domain1'].values

    sentence_counts = sentence.find_counts(essays)
    words_without_stopwords = preprocessing.remove_stopwords(essays)
    tf_idf_values = vectorization.find_word_vector(words_without_stopwords)
    dataset = combine_lists(sentence_counts, words_without_stopwords,
                            tf_idf_values, scores1, scores2)

    # Create dataframe for the Random Forest algorithm
    df = DataFrame(dataset,
                   columns=['sentence_count', 'english_word', 'non_english_word',
                            'CC', 'DT-PDT', 'IN', 'JJ', 'JJR', 'JJS', 'MD',
                            'NN', 'NNS', 'NNP', 'NNPS', 'RB2', 'RBR', 'RBS',
                            'VB', 'VBD-VBN', 'VBG', 'VBP-VBZ', 'other_tags',
                            'word_count', 'td_idf', 'unique', 'score',
                            'essay_wo_stopwords'])
    df.to_csv(csv_name, index=False)
def two_classifier(text, lang, embedding_name, model_path, model_file_JIH,
                   model_file_EXR):
    # ----------------------------------------------------------------------
    # --- LOAD MODELS AND EMBEDDING
    # ----------------------------------------------------------------------
    cls_JIH = pickle.load(open(model_path + model_file_JIH, 'rb'))
    cls_EXR = pickle.load(open(model_path + model_file_EXR, 'rb'))
    embedding = Embeddings(embedding_name)

    # ----------------------------------------------------------------------
    # --- PROCESSING
    # ----------------------------------------------------------------------
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob_JIH = cls_JIH.predict_proba(vectorized_text2)[:, 1]
        prob_EXR = cls_EXR.predict_proba(vectorized_text2)[:, 1]
    else:
        vectorized_text = np.zeros((300,))
        prob_JIH = [0]  # keep iterable so the return loop below still works
        prob_EXR = [0]

    # Return the higher of the two class probabilities.
    if prob_JIH > prob_EXR:
        prob = prob_JIH
    else:
        prob = prob_EXR
    for i in list(prob):
        return i
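# Hypothetical usage sketch: the embedding name and model path follow the
# conventions used by the training function classifier() further below; the
# concrete model file names here are assumptions.
score = two_classifier("sample text to score",
                       lang='en',
                       embedding_name='embedding-EN',
                       model_path='./data/probability/insikt/',
                       model_file_JIH='user1_case1_JIH_classifier.model',
                       model_file_EXR='user1_case1_EXR_classifier.model')
print(score)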
def process_query(query):
    cleaned_query = pro.tokenizer_fun(query)
    cleaned_query = pro.remove_stopwords(cleaned_query)
    cleaned_query = pro.stemming(cleaned_query)
    return cleaned_query
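# A minimal usage sketch; the query string is only an example, and `pro` is the
# project's preprocessing module used inside process_query().
print(process_query("Which universities offer natural language processing courses?"))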
def testStripStopwords(self):
    self.assertEqual(remove_stopwords("the world is square"), " world square")
    self.assertEqual(remove_stopwords(u"一般使用的单位是人数或居住的人口数。"),
                     u"使用单位人数居住人口数。")
def begin_search():
    f = open('./inverted_index/fileNumber.txt', 'r')
    global number_of_files
    number_of_files = int(f.read().strip())
    f.close()

    query_file = sys.argv[1]
    with open(query_file, 'r') as q:
        queries = q.readlines()

    data = ""
    for query in queries:
        # Each query line has the form "K, query terms...".
        global K
        K = query.split(', ')[0]
        K = int(K)
        query = query.split(', ')[1:]
        temp_query = ''
        for i in query:
            temp_query += i + ' '
        query = temp_query
        query = query.lower()

        start = timeit.default_timer()
        if re.match(r'[t|b|i|c|l]:', query):
            # Field query, e.g. "t:gandhi b:independence".
            tempFields = re.findall(r'([t|b|c|i|l]):', query)
            words = re.findall(r'[t|b|c|i|l]:([^:]*)(?!\S)', query)
            # print(tempFields, words)
            fields, tokens = [], []
            si = len(words)
            i = 0
            while i < si:
                for word in words[i].split():
                    fields.append(tempFields[i])
                    tokens.append(word)
                i += 1
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            # print(fields, tokens)
            results = field_query_ranking(tokens, fields)
            # print(results)
        else:
            tokens = tokenise(query)
            tokens = remove_stopwords(tokens)
            tokens = stem(tokens)
            results = simple_query_ranking(tokens)
            # print(results)

        if len(results) > 0:
            results = sorted(results, key=results.get, reverse=True)
            if len(results) > K:
                results = results[:K]
            for key in results:
                key = key.rstrip()
                title, title_doc_num = find_title(key)
                data += title_doc_num
                data += ', '
                # print(title_doc_num, end=' ')
                if title is not None:
                    for i in title:
                        data += i + ' '
                        # print(i, end=' ')
            data = data[:-1]
        else:
            data += "No results found! Try modifying the search by reducing the length maybe?\n"

        end = timeit.default_timer()
        data += str(end - start) + ', '
        data += str((end - start) / K)
        data += '\n\n'
        # print('\n')

    # print('data', data)
    with open('queries_op.txt', 'w') as f:
        f.write(data)
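# Hypothetical usage sketch: begin_search() expects the query file path in
# sys.argv[1]; each line has the form "K, query terms", optionally with the
# field prefixes t:, b:, i:, c:, l: (presumably title, body, infobox, category
# and links, matching the process* helpers above). The file name and queries
# below are assumptions for illustration only.
with open('sample_queries.txt', 'w') as qf:
    qf.write("5, mahatma gandhi\n")
    qf.write("3, t:gandhi b:independence movement\n")

sys.argv = ['search.py', 'sample_queries.txt']
begin_search()  # ranked results and per-query timings go to queries_op.txt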
def preprocess(self):
    l_0 = pre.tokenize_tweet(self.txt)
    l_1 = pre.remove_stopwords(l_0)
    # self.term = pre.stemming(l_1)
    self.term = l_1
def predict(essay, selected_topic):
    if selected_topic == 'Computer':
        dataframe = pd.read_csv('result.csv', encoding="ISO-8859-1")
    elif selected_topic == 'Library':
        dataframe = pd.read_csv('library_result.csv', encoding="ISO-8859-1")
    elif selected_topic == 'Cyclist':
        dataframe = pd.read_csv('cyclist_result.csv', encoding="ISO-8859-1")

    essays_without_stopwords = dataframe.iloc[:, 26].values

    essay = [essay]
    essay_sentence_counts = sentence.find_counts(essay)
    essay_words_without_stopwords = preprocessing.remove_stopwords(essay)
    essay_words_without_stopwords_count = len(essay_words_without_stopwords[0])
    essay_without_stopwords = ' '.join(essay_words_without_stopwords[0])
    essays_without_stopwords = np.append(essays_without_stopwords,
                                         essay_without_stopwords)

    tf_idf_scores = vectorization.find_word_vector_v2(essays_without_stopwords)
    for i in range(len(dataframe)):
        dataframe.iloc[i, 23] = tf_idf_scores[i][0]

    # Build the 25-column feature vector for the new essay.
    essay_data = [0] * 25
    for i in range(len(essay_sentence_counts[0])):
        essay_data[i] = essay_sentence_counts[0][i]
    essay_data[22] = essay_words_without_stopwords_count
    essay_data[23] = tf_idf_scores[-1][0]
    essay_data[24] = tf_idf_scores[-1][1]
    essay_data = [np.array(essay_data)]

    # Random Forest Algorithm
    # Split train and test sets
    X = dataframe.iloc[:, 0:25].values
    y = dataframe.iloc[:, 25].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)
    # Scale values
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    essay_test = sc.transform(essay_data)
    # Run Random Forest
    regressor = RandomForestRegressor(n_estimators=100, random_state=0)
    regressor.fit(X_train, y_train)
    essay_score_pred = regressor.predict(essay_test)[0]
    # End of the Random Forest

    # Predict Topic
    essay_topic_pred = topic.find_topic(essay_words_without_stopwords)
    if selected_topic != str(essay_topic_pred).capitalize():
        print('Selected topic and predicted topic did not match. '
              'Are you sure you selected the right topic?')
        print('Predicted Topic: ', str(essay_topic_pred).capitalize())
        essay_score_pred *= 0.6
    print('Predicted Score: ', essay_score_pred)
def classifier(annotated_data, lang, user_id, case_id, clas_name):
    # ----------------------------------------------------------------------
    # --- DEFINE FILES AND LANGUAGE
    # ----------------------------------------------------------------------
    model_path = './data/probability/insikt/'
    model_file = user_id + '_' + case_id + '_' + clas_name + '_classifier.model'
    if lang == 'en':
        embedding_name = 'embedding-EN'
    if lang == 'ar':
        embedding_name = 'embedding-AR'
    if lang == 'es':
        embedding_name = 'embedding-ES'
    if lang == 'ro':
        embedding_name = 'embedding-RO'
    if lang == 'fr':
        embedding_name = 'embedding-FR'
    embedding = Embeddings(embedding_name)

    # ----------------------------------------------------------------------
    # --- GENERAL SCRIPT
    # ----------------------------------------------------------------------
    ########## Tokenize + stopwords
    # print(annotated_data)
    # raw_data = np.array(annotated_data)
    x_train = [i[0] for i in annotated_data]
    # print(x_train)
    y_train = [i[1] for i in annotated_data]  # replace N0 for L0...!!!
    # print(y_train)
    x_train_DL = []
    print('Data training with ' + str(len(x_train)) + ' texts')
    for text in x_train:
        # print(text)
        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                      len(no_stpw_text))
        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            vectorized_text = np.zeros((300,))
        # print(vectorized_text)
        # x_train_DL.append(np.reshape(vectorized_text, (1, -1)))
        x_train_DL.append(vectorized_text)

    ########## Build and test classifiers with 10-fold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    # Stochastic Gradient Descent classifier with logistic loss
    cls = SGDClassifier(loss="log", penalty="l2",
                        max_iter=500).fit(x_train_DL, y_train)
    scores = cross_val_score(cls, x_train_DL, y_train, cv=skf,
                             scoring='accuracy')
    print("Accuracy C-10V EN: %2.1f (+/- %2.1f)" %
          (100 * scores.mean(), scores.std() * 200))
    print(cls.classes_)  # check that the class at the second position is 'Yes'
    accuracy = round(100 * scores.mean(), 2)

    ########## Save the model
    pickle.dump(cls, open(model_path + model_file, 'wb'))
    return accuracy
from gensim.models.word2vec import FAST_VERSION
FAST_VERSION = 1
import sys
sys.path.append('../lib/')

file = []
path = r'rvm.txt'  # path to file
for string in open(path, 'r', encoding='cp1251'):
    file.append(string.lower())

file_split = split_file(file)
text = clean_text([file_split[i][0] for i in range(len(file_split))])  # remove symbols in text
clear_text = remove_stopwords(text)  # remove stop-words in text
s = func_lemma(func_container(clear_text))  # lemmatization procedure
w = func_tokenize(s)  # train dataset after the preprocessing procedure

path = r'lenta-ru-news.csv'  # path to test dataset
df = pd.read_csv(path, engine='python', delimiter=',', encoding="utf-8-sig")

# plot topic news distribution
y_pos = np.arange(len(df['topic'].value_counts()))
performance = df['topic'].value_counts()
plt.figure(figsize=(8, 6))
plt.bar(y_pos, performance, align='center', alpha=0.5, color='g', width=0.8)
plt.xticks(y_pos, df['topic'].value_counts().index.tolist(), rotation=90, size=15)
plt.yticks(size=15)
plt.xlabel('Topics', size=15)