def textrank_labeling(apply_sub_clustering=False):
    """Labeling using textrank"""
    from summa import keywords_custom as keywords
    clusters = list(get_clusters())
    del clusters[-1]
    if apply_sub_clustering:
        clusters_labels = []
        for cluster in clusters:
            sub_clusters = sub_clustering(cluster)
            sub_clusters_size_ratio = [(len(cl) / len(cluster)) for cl in sub_clusters]
            text_sub_clusters = [" ".join(cl) for cl in sub_clusters]
            sub_clusters_top_terms = []
            for text_sub_cluster in text_sub_clusters:
                terms = keywords.keywords(text_sub_cluster)
                terms = {utils.stem(term): weight for term, weight in terms.items()}
                terms = [(term, weight) for term, weight in terms.items()]
                sub_clusters_top_terms.append(terms[:10])
            terms = __combine_terms(sub_clusters_top_terms, sub_clusters_size_ratio)
            clusters_labels.append(get_labels(cluster, terms))
    else:
        text_clusters = [" ".join(cluster) for cluster in clusters]
        clusters_top_terms = []
        for text_cluster in text_clusters:
            terms = keywords.keywords(text_cluster)
            terms = {utils.stem(term): weight for term, weight in terms.items()}
            terms = [(term, weight) for term, weight in terms.items()]
            clusters_top_terms.append(terms[:10])
        clusters_labels = [
            get_labels(cluster, clusters_top_terms[cluster_index])
            for cluster_index, cluster in enumerate(clusters)
        ]
    print_result(clusters_labels)
    # print(clusters_labels)
    print(evaluate_clusters([[label[0] for label in labels] for labels in clusters_labels]))
def flatten_projects(paths, outfn):
    def dursum(p):
        return sum(p.durations[j] for j in p.J) + 1

    maxT = max(dursum(project.load_project(path)) for path in paths)
    with open(outfn, 'w') as fp:
        value_count = len(project.flatten_project(paths[0], maxT))
        fp.write('instance;' + ';'.join(['v' + str(ix) for ix in range(value_count)]) + '\n')
        for path in paths:
            instanceName = utils.stem(path)
            fp.write(instanceName + ';' + ';'.join(project.flatten_project(path, maxT)) + '\n')
def norm(df: pd.DataFrame) -> pd.Series:
    ddf = df.copy(deep=False)
    lemmatizer = ns.WordNetLemmatizer()
    topics = []
    for t_item, o_item in zip(ddf["topics"], ddf["origin"]):
        # for IEEE entries, prefer the author-supplied topics
        if o_item == "ieee":
            if "Author" in t_item:
                ts = re.split(",", re.split(":", t_item)[-1])
            elif "IEEE" in t_item:
                ts = re.split(",", re.search("IEEE Keywords:(.*?);", t_item).groups()[0])
            else:
                try:
                    ts = re.split(",", re.search("INSPEC: Controlled Indexing:(.*?);", t_item).groups()[0])
                except Exception:
                    ts = re.split(",", str(t_item))
        else:
            ts = re.split(",", str(t_item))

        # process the topics of one paper
        ts = hero.remove_html_tags(hero.lowercase(pd.Series(ts)))
        topic = []
        for t in ts:
            t = t.replace(" - ", "-")  # str.replace returns a new string, so assign it back
            if len(re.split("and", t)) == 2 and "-" not in t:
                topic += re.split("and", t)
                continue
            if len(re.split("/", t)) == 2:
                topic += re.split("/", t)
                continue
            if "blockchain" in t and len(re.split(" ", t)) >= 2:
                t = re.split(" ", t)[-1]
            if t != "":
                topic.append(t.replace("\xa0", ""))
        topics.append(",".join([
            similar_replace(stem(remove_chore(t), lemmatizer)) for t in topic
        ]))
        # topics.append(",".join([stem(remove_chore(t), lemmatizer) for t in topic]))
    return pd.Series(topics)
def generate_interests_keywords(self, userid):
    # Initialise ref_topics
    self.get_ref_topics()

    # Get top-10 user topics
    user_topics = get_topics.get_topics(userid)

    # Get top-20 keywords
    top20_keywords = []
    for i in range(len(user_topics)):
        for j in range(0, 2):
            top20_keywords.append(utils.stem(user_topics[i][1][j][0]))

    # Fit top-20 keywords in ref_topics to get interests keywords
    for i in range(0, len(top20_keywords)):
        keyword = top20_keywords[i]
        for j in range(0, len(self.ref_topics)):
            if keyword in self.ref_topics[j][1]:
                self.interests.append(self.ref_topics[j])
    return self.interests
# print(intents)
all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)  # use extend instead of append as we don't want an array of arrays
        xy.append((w, tag))

ignore_words = ['?', '!', '[', ']', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

X_train = []
y_train = []
for (sen, tag) in xy:
    bag = bow(sen, all_words)
    X_train.append(bag)
    # multiclass label
    label = tags.index(tag)
    y_train.append(label)
(mfcc_deltas_deltas_transposed, mfcc_deltas_deltas_transposed_mean)

if __name__ == '__main__':
    num_mfcc = 13
    use_deltas = True

    (sample_rate, signal) = wavfile.read("speakers/russian/female/anonymous104/ru_0036.wav")
    samples_without_pauses = remove_pauses(sample_rate, normalize_signal(signal))
    mfcc_features1 = get_mfcc_features(sample_rate, samples_without_pauses, num_mfcc, use_deltas)

    (sample_rate, signal) = wavfile.read("speakers/russian/female/anonymous104/ru_0037.wav")
    samples_without_pauses = remove_pauses(sample_rate, normalize_signal(signal))
    mfcc_features2 = get_mfcc_features(sample_rate, samples_without_pauses, num_mfcc, use_deltas)

    # Inspect the coefficient values in a few frames
    plt.subplot(2, 1, 1)
    stem(mfcc_features1[0, :num_mfcc - 1], linefmt='r', markerfmt='ro')
    stem(mfcc_features1[1, :num_mfcc - 1], linefmt='b', markerfmt='bo')
    stem(mfcc_features1[20, :num_mfcc - 1], linefmt='y', markerfmt='yo')
    plt.grid(True)

    plt.subplot(2, 1, 2)
    stem(mfcc_features2[0, :num_mfcc - 1], linefmt='r', markerfmt='ro')
    stem(mfcc_features2[1, :num_mfcc - 1], linefmt='b', markerfmt='bo')
    stem(mfcc_features2[20, :num_mfcc - 1], linefmt='y', markerfmt='yo')
    plt.grid(True)
    plt.show()
from utils import (
    avg_sentence_length_for_doc,
    doc_from_path,
    text_files_in_directory,
    load_nlp,
    words_for_doc,
    stem,
)


def get_words(path):
    with open(path) as f:
        return [line.strip() for line in f.readlines()]


familiar_words = get_words('resources/familiar-words.txt')
FAMILIAR_WORDS = set(familiar_words)
FAMILIAR_STEMS = set([stem(w) for w in familiar_words])


def difficult_word_rate_for_doc(doc):
    words = words_for_doc(doc)
    difficult_words = [w for w in words if word_is_difficult(w)]
    return (len(difficult_words) / len(words)) * 100


def difficult_words_for_doc(doc):
    return [w for w in words_for_doc(doc) if word_is_difficult(w)]


def word_is_difficult(word, with_stemming=True):
    if with_stemming:
        return stem(word.lower()) not in FAMILIAR_STEMS
    return word not in FAMILIAR_WORDS


def dc_grade_level(doc):
objs = []
sentences = []  # sentences for training word2vec model
ignore = set("for the a an of for on in and to as or : , . ( ) ? !".split(" "))  # words to ignore when parsing research paper titles
author2paper = defaultdict(list)
paper2author = defaultdict(dict)

with open('trial_data.json') as f:
    data = json.load(f)
    length = len(data)
    for i, obj in enumerate(data):
        if i > 50:
            break
        processed = [stem(word.lower()) for word in tokenize(obj['title'])
                     if word.lower() not in ignore and not word.isdigit()]
        obj["processed"] = processed
        sentences.append(processed)
        objs.append(obj)
        # configure paper2author and add the processed list as one of the key-value pairs
        paper2author[obj["title"]]['author'] = obj["author"]
        paper2author[obj["title"]]['processed'] = processed
        paper2author[obj["title"]]['similarity'] = [float('inf')] * length
        paper2author[obj["title"]]['index'] = i
        for author in obj["author"]:
            author2paper[author].append(obj["title"])

# train the model and get the feature vector for each paper
model = models.Word2Vec(sentences, min_count=1, size=7, window=2)
def process_content(real_content, lang):
    real_content = utils.strip_accents(real_content)
    real_content = utils.remove_stopwords(real_content, lang)
    real_content = utils.stem(real_content, lang)
    return real_content
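# The utils helpers called in process_content above are not shown in this snippet.
# A minimal sketch of what they might look like, assuming NLTK is available and that
# `lang` is a full language name such as "english" or "spanish"; the function names
# mirror the calls above, but the real implementations in that project may differ.
import unicodedata

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


def strip_accents(text):
    # decompose characters and drop combining marks, e.g. "é" -> "e"
    return ''.join(c for c in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(c))


def remove_stopwords(text, lang):
    stop = set(stopwords.words(lang))
    return ' '.join(w for w in text.split() if w.lower() not in stop)


def stem(text, lang):
    stemmer = SnowballStemmer(lang)
    return ' '.join(stemmer.stem(w) for w in text.split())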
# print(intents)
all_words = []
tags = []
x_y = []
for inten in intents["intents"]:
    tag = inten["tag"]
    tags.append(tag)
    for pattern in inten["patterns"]:
        wrd = tokenize(pattern)
        all_words.extend(wrd)
        x_y.append((wrd, tag))

# ignore some symbols
ignore_sym = ['?', '.', '!', ',', "'", '-']
all_words = [stem(wrd) for wrd in all_words if wrd not in ignore_sym]

# remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

# print(len(x_y), "patterns")
# print(len(tags), "tags:", tags)
# print(len(all_words), "unique words:", all_words)

# Creating data-set
X_train = []
Y_train = []
for (ptrn_sent, tag) in x_y:
    bag = bag_of_words(ptrn_sent, all_words)
    X_train.append(bag)
    label = tags.index(tag)
import json

import numpy as np
from gensim.models import Word2Vec

from utils import tokenize, stem, cosineSimilarity

model = Word2Vec.load('model.bin')
validateTitle = "Robust adaptive single neural control for a class of uncertain nonlinear systems with input nonlinearity"
ignore = set("for the a an of for on in and to as or : , . ( ) ? !".split(" "))  # words to ignore when parsing research paper titles
processed = [
    stem(word.lower()) for word in tokenize(validateTitle)
    if word.lower() not in ignore and not word.isdigit()
]


def getFeatureVector(wordList):
    res = []
    for word in wordList:
        try:
            res.append(model[word])
        except KeyError:
            # if the word has never been seen by the model, append a numpy array of all zeroes
            res.append(np.zeros(7, dtype=np.float32))  # 7 is the word2vec feature-size hyperparameter
    return res


feature = getFeatureVector(processed)
author = ['Bryant Zhou']
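# cosineSimilarity is imported from utils above but its implementation is not shown.
# A minimal sketch of such a helper (an assumed implementation, not the project's actual
# code), operating on two equal-length numpy vectors such as the word2vec features above:
def cosine_similarity_sketch(v1, v2):
    # cos(theta) = (v1 . v2) / (|v1| * |v2|); return 0.0 when either vector is all zeros
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(v1, v2) / (norm1 * norm2))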
import sys
import string
import re
import json
import xml.etree.ElementTree as ET

import nltk.data
from nltk.stem import WordNetLemmatizer

import utils

content, full_word_dict = utils.stem(utils.split(sys.argv[2]))
content = content[1:-1].split(', ')
s = utils.getStructure(content)

# dictionary
res = utils.buildDict(s, full_word_dict)
if res:
    dictionary = json.dumps(res)

# output dictionary
if sys.argv[1] == "Dictionary":
    print(dictionary) if dictionary else print("")

# output ecosystem/animal
if sys.argv[1] == "Topic":
    if res:
        print(next(iter(res)).title()) if next(iter(res)) else print("")
    else:
        for s in content:
            words = s.split(' ')
            for word in words:
                word = word.translate(str.maketrans('', '',
all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

# Stem and lower each word
ignored_chars = ['?', '!', '.', ',']
all_words = [stem(w) for w in all_words if w not in ignored_chars]

# Remove duplicates and sort
all_words = sorted(set(all_words))
tags = sorted(set(tags))

print(f"{len(xy)} Patterns\n\n{len(tags)} Tags: {tags}")

# create training data
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: PyTorch CrossEntropyLoss needs only class labels, not one-hot
    label = tags.index(tag)
data = [preprocess(text) for text in data]
with open('processed.pkl', 'wb') as f:
    pickle.dump((data, y), f)

keywords = {
    0: ['love', 'people', 'time', 'day', 'life'],
    1: ['free', 'video', 'join', 'check', 'win'],
    2: ['f****d', 'ass', 'bitch', 'bad', 'shit'],
    3: ['hate', 'n***a', 'idiot', 'ass', 'trump']
}
for class_label, words in keywords.items():
    keywords[class_label] = [stem(w) for w in words]

# get count features
count_vectorizer = CountVectorizer(input='content', encoding='ascii', decode_error='ignore',
                                   strip_accents='ascii', stop_words='english', min_df=2)
count_weights = count_vectorizer.fit_transform(data)
vocabulary = count_vectorizer.vocabulary_

# get tf-idf features
vectorizer = TfidfVectorizer(input='content', encoding='ascii', decode_error='ignore',
                             strip_accents='ascii', stop_words='english', min_df=2,
                             vocabulary=vocabulary)
tfidf_weights = vectorizer.fit_transform(data)
print("Please wait while we are stemming the text ... \n") # Processing of the input text before neighbors finding prog = re.compile("[_\-\(]*([A-Z]\.)*[_\-\(]*") for t in toProcess: terms = toProcess[t].split() #stem(algo,toProcess[t]).split() text = "" for w in terms: if (prog.match(w)): w = w.replace('.', '') text = text + " " + w text = ' '.join(escape(text).split()) text = " ".join(nltk.word_tokenize(text)) d = [] text = " ".join([stem(algo, w) for w in text.split()]) d += text.split() toProcess[t] = d #print(toProcess) if bool(args["--stop"]): stopWordsList = set(stopwords.words('english')) print(stopWordsList) for i in toProcess: listWords = [w for w in toProcess[i] if w not in stopWordsList] toProcess[i] = listWords #print(toProcess) #print(toProcess)
def get_stories(id, type):
    ui = user_interests.UserInterests()
    ui.interests = []
    interests = ui.generate_interests_keywords(id)

    ALGOLIA_URL = 'https://hn.algolia.com/api/v1/'
    if type == 'top':
        ALGOLIA_URL += 'search?tags=front_page&hitsPerPage=50'
    elif type == 'new':
        ALGOLIA_URL += 'search_by_date?tags=story&hitsPerPage=50'
    elif type == 'show':
        ALGOLIA_URL += 'search_by_date?tags=show_hn&hitsPerPage=50'
    elif type == 'ask':
        ALGOLIA_URL += 'search_by_date?tags=ask_hn&hitsPerPage=50'

    req = Request(ALGOLIA_URL, headers={'User-Agent': 'Mozilla/5.0'})
    data = json.loads(urlopen(req).read().decode('utf8'))
    stories = []

    # Shuffle interests
    random.shuffle(interests)

    for i in range(0, len(data['hits'])):
        title = data['hits'][i]['title']
        url = data['hits'][i]['url']
        time = data['hits'][i]['created_at']
        author = data['hits'][i]['author']
        points = data['hits'][i]['points']
        comments_count = data['hits'][i]['num_comments']

        if url == '':
            continue
        if type == 'ask':
            title = title.replace('Ask HN:', '')
        if type == 'show':
            title = title.replace('Show HN:', '')

        words = gensim.utils.simple_preprocess(str(title), deacc=True)
        story = {}
        flag = False
        for word in words:
            word = word.lower()
            for interest in interests:
                if word in interest[1] or utils.stem(word) in interest[1]:
                    story['title'] = title
                    story['url'] = url
                    story['time'] = time
                    story['author'] = author
                    story['points'] = points
                    story['comments'] = comments_count
                    story['topics'] = interest[0]
                    stories.append(story)
                    flag = True
                    break
            if flag:
                break
    return stories[1:]
with open('intents.json', 'r') as file:
    intents = json.load(file)

all_words = []
tags = []  # all the categories
xy = []  # zip processed arrs with tag
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        processed = tokenize(pattern)
        all_words.extend(processed)
        xy.append((processed, tag))

ignored = [',', '.', '?', '!']
all_words = [stem(word) for word in all_words if word not in ignored]  # stem all the words
all_words = sorted(set(all_words))  # get rid of duplicates
tags = sorted(set(tags))  # get rid of duplicates

X_train = []
y_train = []
for (processed_sentence, tag) in xy:
    bag = bag_of_words(processed_sentence, all_words)
    X_train.append(bag)
    label = tags.index(tag)
    y_train.append(label)

# parameters
batch_size = 8
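# The tokenize, stem and bag_of_words helpers used in the intent-training snippets above
# (bag_of_words is called bow in one of them) are imported from elsewhere and not shown.
# A minimal sketch of common implementations, assuming NLTK and NumPy; the real helpers
# in those projects may differ.
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()


def tokenize(sentence):
    # split a sentence into word/punctuation tokens
    return nltk.word_tokenize(sentence)


def stem(word):
    # reduce a word to its lowercase root form, e.g. "organizes" -> "organ"
    return stemmer.stem(word.lower())


def bag_of_words(tokenized_sentence, all_words):
    # binary vector: 1.0 at the index of each known word that appears in the stemmed sentence
    sentence_words = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag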