def groupSuject(array):
    # Merge consecutive 'subject' words into a single Word whose text parts
    # are joined with '_' (e.g. "John Smith" -> "John_Smith").
    old_token = ''
    text = ''
    new_array = []
    size = len(array)
    count = 1
    for a in array:
        if a.token == 'subject':
            if a.token == old_token:
                text += "_" + a.text
            else:
                text = a.text
            if count == size:
                # The input ends on a subject: flush the pending group.
                new_array.append(Word(text, 'subject'))
                text = ''
        else:
            if old_token == 'subject':
                # A non-subject word closes the pending subject group.
                new_array.append(Word(text, 'subject'))
                text = ''
            new_array.append(a)
        old_token = a.token
        count += 1
    return new_array

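# A minimal usage sketch for groupSuject (hypothetical input; a Word with
# .text and .token attributes is assumed, since the project's Word class is
# defined elsewhere):
#
#   words = [Word('John', 'subject'), Word('Smith', 'subject'), Word('runs', 'verb')]
#   grouped = groupSuject(words)
#   [(w.text, w.token) for w in grouped]
#   # -> [('John_Smith', 'subject'), ('runs', 'verb')]
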
import os

def load_data(data, category):
    # Collect every .wav file in the given category directory as a Word
    # with its MFCC feature matrix precomputed.
    os.chdir(category)
    for filename in os.listdir(os.getcwd()):
        if filename.endswith(".wav"):
            W = Word(category, filename)
            W.set_mfcc_matrix()
            data.append(W)
    os.chdir("..")

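# Hypothetical usage sketch: build a dataset from per-category directories of
# .wav recordings (the directory names 'yes' and 'no' are assumptions; this
# Word variant wraps an audio file and set_mfcc_matrix is assumed to read the
# file internally):
#
#   data = []
#   load_data(data, 'yes')
#   load_data(data, 'no')
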
def reverse_poss_dict(poss_dict_path):
    # Invert the possessive dictionary: map each function form back to the
    # list of (lemma, POS) Words that can express it.
    out = dict()
    with open(poss_dict_path, encoding='utf-8') as f:
        lines = (l.strip().split('\t') for l in f)
        for c in lines:
            for fun in c[2:]:
                if fun in out:
                    out[fun].append(Word(c[0].lower(), c[1].lower()))
                else:
                    out[fun] = [Word(c[0].lower(), c[1].lower())]
    return out

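# Hypothetical usage sketch: for a tab-separated line such as
# "columnist\tNOUN\tcolumnistFem_N\tcolumnistMasc_N", the reversed dictionary
# maps each function form to the Words that can realize it ('poss_dict.tsv'
# is an assumed path):
#
#   rev = reverse_poss_dict('poss_dict.tsv')
#   rev['columnistFem_N']
#   # -> [Word('columnist', 'noun'), ...]
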
def relatedness(context1, context2, cocs, key_sets):
    # Turn both raw contexts into lemma lists, keeping only relevant words.
    context1 = filter(lambda x: not x == "" and x.relevant(),
                      [Word(w) for w in context1.lower().split(" ")])
    context1 = [w.lemma() for w in context1]
    context2 = filter(lambda x: x.relevant(),
                      [Word(w) for w in context2.lower().split(" ")])
    context2 = [w.lemma() for w in context2]
    # Replace each context with its co-occurrence representation.
    context1 = get_coc(context1)
    context2 = get_coc(context2)
    # Average, over all key sets, the product of the two contexts'
    # similarities to that key set.
    total = 0.0
    for i in range(len(key_sets)):
        s1 = dic_similarity(cocs[i], key_sets[i], context1)
        s2 = dic_similarity(cocs[i], key_sets[i], context2)
        total += s1 * s2
    total /= len(key_sets)
    return total

from selenium.webdriver.common.by import By

def import_wordlist(self):
    # Scrape word pairs from the page: each '.thing' element holds word A in
    # its third child div and word B in its fourth.
    word_list_elmnt = self.driver.find_elements(By.CLASS_NAME, 'thing')
    for word_elmnt in word_list_elmnt:
        word_a = word_elmnt.find_element(By.XPATH, './/div[3]/div').text  # Word A
        word_b = word_elmnt.find_element(By.XPATH, './/div[4]/div').text  # Word B
        self.wordList.append(Word(word_a, word_b))

def switchDisjuntion(array):
    # One of four passes that rewrite logical connectives as symbols;
    # here every disjunction word becomes the 'v' connective.
    new_array = []
    for a in array:
        if a.token == 'disjuntion':
            new_array.append(Word('v', 'disjuntion'))
        else:
            new_array.append(a)
    return new_array

def switchImpilies(array):
    new_array = []
    for a in array:
        if a.token == 'implies':
            new_array.append(Word('->', 'implies'))
        else:
            new_array.append(a)
    return new_array

def switchNegative(array):
    new_array = []
    for a in array:
        if a.token == 'negative':
            new_array.append(Word('~', 'negative'))
        else:
            new_array.append(a)
    return new_array

def switchConjuntion(array):
    new_array = []
    for a in array:
        if a.token == 'conjuntion':
            new_array.append(Word('^', 'conjuntion'))
        else:
            new_array.append(a)
    return new_array

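# Usage sketch for the four switch* passes above (hypothetical input): each
# pass rewrites the text of matching connectives and leaves everything else
# alone, so they can be chained in any order:
#
#   ops = [Word('or', 'disjuntion'), Word('and', 'conjuntion')]
#   out = switchConjuntion(switchDisjuntion(ops))
#   [(w.text, w.token) for w in out]
#   # -> [('v', 'disjuntion'), ('^', 'conjuntion')]
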
from collections import defaultdict

def read_poss_dict(path):
    with open(path, encoding='utf-8') as f:
        # format:
        # columnist \t NOUN \t columnistFem_N \t columnistMasc_N
        lines = (l.strip().split('\t') for l in f)
        return defaultdict(
            lambda: [],
            {Word(l[0].lower(), l[1].lower()): l[2:] for l in lines})

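# Hypothetical usage sketch (Word must be hashable to serve as a dict key;
# the file path is an assumption):
#
#   d = read_poss_dict('poss_dict.tsv')
#   d[Word('columnist', 'noun')]   # -> ['columnistFem_N', 'columnistMasc_N']
#   d[Word('missing', 'noun')]     # -> [] (defaultdict fallback)
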
def toPL(cls, lp):
    # Translate a list of natural-language sentences into propositional-logic
    # formulas: the last sentence is the conclusion, the rest are premises.
    atomic_map = {}  # subject text -> atomic symbol (e.g. 'P1')
    atomics = []
    atomic_name = 'P'
    atomic_num = 1
    results = []
    for text in lp:
        array = []
        words = text.split(' ')
        new_array = []
        for word in words:
            token = tokenize(word)
            array.append(Word(word, token))
        # Normalization pipeline: strip ligatures, reposition negations,
        # group multi-word subjects, then rewrite connectives as symbols.
        array = removeLigations(array)
        array = positionNegative(array)
        array = groupSuject(array)
        array = switchImpilies(array)
        array = switchDisjuntion(array)
        array = switchConjuntion(array)
        array = switchNegative(array)
        for a in array:
            if a.token == 'subject':
                if a.text not in atomics:
                    # First occurrence: assign a fresh atomic symbol.
                    atomic = atomic_name + str(atomic_num)
                    atomic_map[a.text] = atomic
                    atomic_num += 1
                    atomics.append(a.text)
                    new_array.append(Word(atomic, 'subject'))
                else:
                    atomic = atomic_map.get(a.text)
                    new_array.append(Word(atomic, 'subject'))
            else:
                new_array.append(a)
        results.append(" ".join(a.text for a in new_array))
    # The last formula is the conclusion; everything before it is a premise.
    conclusion = results[-1]
    results = results[:-1]
    return results, conclusion, atomic_map

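# Hypothetical usage sketch for toPL, assuming it is bound as a classmethod on
# a hypothetical owner class (here called Translator) and that tokenize() tags
# plain nouns as 'subject' and connective words with tokens like 'implies'
# (tokenize and the other normalization helpers are defined elsewhere):
#
#   premises, conclusion, mapping = Translator.toPL(['John implies Mary', 'John'])
#   # premises   -> ['P1 -> P2']
#   # conclusion -> 'P1'
#   # mapping    -> {'John': 'P1', 'Mary': 'P2'}
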
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
data_rating = pd.concat([data_rating_train, data_rating_test])
print('Range of userId is [{}, {}]'.format(data_rating.userId.min(), data_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(data_rating.itemId.min(), data_rating.itemId.max()))

# Read the grouping information
if args.pretrain_grouping:
    data_grouping = pd.read_csv(args.data_grouping, sep=",", header=0,
                                names=['friendId', 'tagId', 'score'], engine='python')
    config['num_friends_pretrain'] = int(data_grouping.friendId.max() + 1)
    config['num_items_pretrain'] = int(data_grouping.tagId.max() + 1)
    del data_grouping
    print("group data reading finished!")

# Process the tweet
vocab = Word()
tweet = vocab.load_tweets(data_tweet, args.max_seq_len)
pad_word = vocab.pad
tweet_pad = np.full(shape=(1, args.max_seq_len), fill_value=pad_word, dtype=np.int64)
tweet = np.vstack([tweet, tweet_pad])

# config
config['num_users'], config['num_items'] = int(data_rating.userId.max() + 1), int(data_rating.itemId.max() + 1)
config['user_friends'], config['user_tweets'], config['num_friends'] = load_friends_tweets(args.data_profile)
args.tweet = tweet
config['args'] = args
config['vocab'] = vocab

# Specify the exact model
model = sys.argv[1] if len(sys.argv) == 2 else "gmf"
if args.model.lower() == "gmf":

print('Range of userId is [{}, {}]'.format(data_rating.userId.min(), data_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(data_rating.itemId.min(), data_rating.itemId.max()))
print('Range of tweetId is [{}, {}]'.format(data_rating.tweetId.min(), data_rating.tweetId.max()))

# Read the grouping information
if args.pretrain_grouping:
    data_grouping = pd.read_csv(args.data_grouping, sep=",", header=0,
                                names=['friendId', 'tagId', 'score'], engine='python')
    config['num_friends_pretrain'] = int(data_grouping.friendId.max() + 1)
    config['num_items_pretrain'] = int(data_grouping.tagId.max() + 1)
    del data_grouping
args.item_num = int(data_rating.itemId.max() + 1)

# Process the tweet
vocab = Word()
tweet = vocab.load_tweets(data_tweet, max_len=200)

# config
config['num_users'], config['num_items'] = int(data_rating.userId.max() + 1), int(data_rating.itemId.max() + 1)
config['user_friends'], config['num_friends'] = load_friends(args.data_friends)
args.tweet = tweet
config['args'] = args
config['vocab'] = vocab

print "Starting..." # initiate empty ratings methodsRating = [] humanRating = [] questions = task.values() jointVocCache = dict() partVoc = set(vectors.keys()) print len(disambiguatedWords), "disambiguated words" done = 0 for i in xrange(len(questions)): question = questions[i] word1 = Word(question['word1']).lemma() word2 = Word(question['word2']).lemma() context1 = [ Word(x).lemma() for x in question['context1'].lower().split(' ') ] context2 = [ Word(x).lemma() for x in question['context2'].lower().split(' ') ] # so we are not using disambiguated words in the context..? context1 = filter(lambda x: x in partVoc, context1) context2 = filter(lambda x: x in partVoc, context2) # set finders to false w1 = False w2 = False
def get_bigrams(tree):
    # Yield a (dependent, head, relation) triple for every word in the parse.
    for w in tree:
        dep = Word(w.lemma, w.upostag)
        head = Word(tree[w.head].lemma, tree[w.head].upostag)
        yield (dep, head, w.deprel)

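# Hypothetical usage sketch, assuming `tree` is a dependency parse (e.g. a
# conllu-style token list) where each word carries .lemma, .upostag, .head
# and .deprel, and tree[w.head] resolves the head token:
#
#   for dep, head, rel in get_bigrams(tree):
#       print(dep, rel, head)
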
task, _ = load_task(taskFilename)
questions = list(task.values())
methodsRating = []
humanRating = []
print("Answering", len(task), "questions...")
for i in range(len(questions)):
    if i % 100 == 0 and not i == 0:
        print("\tIteration", i, ": ", spearman(methodsRating, humanRating))
    question = questions[i]
    word1 = Word(question['word1']).lemma()
    word2 = Word(question['word2']).lemma()
    # The target word is appended twice so it weighs more in its own context.
    context1 = question['context1'] + " " + question['word1'] + " " + question['word1']
    context2 = question['context2'] + " " + question['word2'] + " " + question['word2']
    r_s = relatedness(context1, context2, newD, key_sets)
    v_s = vector_similarity(r_s, word1, word2, vectors)
    score = r_s * v_s**2
    methodsRating.append(score)
    humanRating.append(question['rating'])
print()
print(spearman(methodsRating, humanRating))