def get_wikipedia_word_ranked_list():
    """ create and save two dictionaries: word to rank, and word to count """
    wordcount = {}
    filename = '<english wikipedia dump location>'
    with open(filename, 'r') as fin:
        for line in fin:
            for token in line.split():
                count = wordcount.get(token, 0)
                wordcount[token] = count + 1
            # end for
        # end for
        sorted_wordcount = sorted(wordcount, key=wordcount.get, reverse=True)
        ranks = {}
        for count, key in enumerate(sorted_wordcount):
            if count > 500000: continue
            ranks[key] = count
        # end for
    # end with
    Serialization.save_obj(wordcount, 'dict.counts.cs')
    Serialization.save_obj(ranks, 'dict.ranks.cs')
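# The functions in this listing assume a small `Serialization` helper exposing
# `save_obj(obj, name)` and `load_obj(name)`. This is a minimal pickle-based
# sketch under that assumption (the storage directory and '.pkl' suffix are
# illustrative, not taken from the original code):
import pickle

class Serialization:
    OBJ_DIR = 'obj/'  # assumed location for serialized objects

    @staticmethod
    def save_obj(obj, name):
        # persist an arbitrary python object as obj/<name>.pkl
        with open(Serialization.OBJ_DIR + name + '.pkl', 'wb') as fout:
            pickle.dump(obj, fout, pickle.HIGHEST_PROTOCOL)

    @staticmethod
    def load_obj(name):
        # load a previously saved object; raises FileNotFoundError if missing
        with open(Serialization.OBJ_DIR + name + '.pkl', 'rb') as fin:
            return pickle.load(fin)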
def load_data(filename, common_users):
    """ generates a dictionary of user to all their (cs or monolingual) posts
    :param filename: csv file with user posts
    :param common_users: a list of users who have both cs and monolingual texts
    :return: user to posts map
    """
    texts = {}
    with open(filename, 'r') as fin:
        print('reading', filename)
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = next(csv_reader)
        for line in csv_reader:
            if len(line) < 8: continue
            if len(line[7].split()) < MIN_SENTENCE_LENGTH: continue
            author = line[0].strip()
            if author not in common_users: continue
            text_by_author = texts.get(author, [])
            text_by_author.append(' '.join(word_tokenize(line[7].strip().lower())))
            texts[author] = text_by_author
        # end for
    # end with
    object_name = '<cs or monolingual texts by author>'
    Serialization.save_obj(texts, object_name)
    return texts
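# Hypothetical invocation of load_data(); the CSV layout assumed above is the
# author name in column 0 and the post text in column 7, and the file name
# below is a placeholder:
# common_users = Serialization.load_obj('common.users')
# cs_texts_by_author = load_data('<a csv file with code-switched posts>', common_users)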
def filter_out_non_english_posts(dataobject):
    """ given a list of posts, filter in clean monolingual english posts
    :param dataobject: user to posts object
    :return: user to posts clean dictionary
    """
    clean_data = {}
    data = Serialization.load_obj(dataobject)
    for author in data:
        print('processing:', author)
        author_eng_posts = []
        for post in data[author]:
            sentences = []
            for sentence in re.split(r'\.|\! |\? |\n', post):
                if len(sentence.split()) < 10: continue
                try:
                    detector = Detector(sentence)
                except:
                    continue
                if detector.languages[0].name == 'English' and \
                        detector.languages[0].confidence > DETECTOR_CONFIDENCE:
                    sentences.append(sentence)
                # end if
            # end for
            if len(sentences) == 0: continue
            author_eng_posts.append('. '.join(sentences))
        # end for
        if len(author_eng_posts) == 0: continue
        clean_data[author] = author_eng_posts
    # end for
    Serialization.save_obj(clean_data, dataobject + '.clean')
    for author in clean_data:
        print(author, len(clean_data[author]))
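# The filter above relies on polyglot's Detector; a minimal standalone version
# of the per-sentence check (the 90.0 threshold is an assumed value for
# DETECTOR_CONFIDENCE, since polyglot reports confidence on a 0-100 scale):
from polyglot.detect import Detector

def is_clean_english(sentence, confidence_threshold=90.0):
    try:
        detector = Detector(sentence)
    except Exception:
        return False
    top_language = detector.languages[0]
    return top_language.name == 'English' and top_language.confidence > confidence_threshold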
def get_embeddings(data, title):
    """ encode a list of texts into sentence embeddings, caching the result
    :param data: list of texts to encode
    :param title: name of the serialized embeddings object
    :return: embeddings
    """
    try:
        embeddings = Serialization.load_obj(title)
    except FileNotFoundError:
        embeddings = model.encode(data, show_progress_bar=True)
        Serialization.save_obj(embeddings, title)
    return embeddings
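# get_embeddings() relies on a module-level `model`; a plausible setup with
# sentence-transformers (the specific checkpoint name is an assumption):
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('bert-base-nli-mean-tokens')
# post_embeddings = get_embeddings(author_posts, '<embeddings object name>')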
def extract_proficiency_metrics(objname):
    """ extract lexical and grammatical proficiency metrics given user to posts data
    :param objname: pickle object with user to posts data
    :return:
    """
    metrics = {}
    data = Serialization.load_obj(objname)
    for author in data:
        if len(data[author]) < MIN_POSTS_FOR_TEST: continue
        metrics[author] = Proficiency.compute_lexical_metrics(data[author])
        metrics[author].extend(Proficiency.compute_grammatical_metrics(data[author]))
        print(author, metrics[author])
        sys.stdout.flush()
    # end for
    Serialization.save_obj(metrics, objname.replace('data', 'metrics.lex.gramm.clean'))
    print(len(metrics))
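# Proficiency.compute_lexical_metrics / compute_grammatical_metrics are defined
# elsewhere; purely as an illustration of one lexical metric over a user's posts
# (not necessarily among the metrics computed by Proficiency), a type-token
# ratio could be computed as:
def type_token_ratio(posts):
    # posts: list of whitespace-tokenized strings for a single author
    tokens = [token for post in posts for token in post.split()]
    return len(set(tokens)) / len(tokens) if tokens else 0.0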
def load_data(file_cs, file_monolingual):
    """ loads posts by code-switchers and non-code-switchers
    :param file_cs: a csv file with posts by frequent code-switching users
    :param file_monolingual: a csv file with posts by users who don't (or very rarely) code-switch
    :return:
    """
    data_cs, subreddits_cs = DataProcessing.read_data(file_cs)
    data_monolingual, subreddits_monolingual = DataProcessing.read_data(file_monolingual)
    subreddits = subreddits_cs
    subreddits.extend(subreddits_monolingual)
    subreddits = set(subreddits)
    Serialization.save_obj(data_cs, DATA_CS)
    Serialization.save_obj(data_monolingual, DATA_MONOLINGUAL)
    print('code-switchers:', len(data_cs), 'non-code-switchers:', len(data_monolingual))
    print('total subreddits:', len(subreddits))
def init_vad():
    """ load the valence lexicon, embed its words, and fit (or load a cached)
    valence regression model
    :return: fitted valence model
    """
    df_vad = pd.read_csv('/ais/hal9000/jai/lexicon.txt', delimiter='\t', header=0)
    df_vad = df_vad.dropna().reset_index(drop=True)
    df = df_vad[['Word', 'Valence']]
    valence = np.array(df['Valence'].tolist())
    vad_words = list(df_vad['Word'])
    vad_embeddings = LexicalAnalysis.get_embeddings(vad_words, "vad")
    print("LOADING VALENCE MODEL")
    try:
        valence_model = Serialization.load_obj('valence_model')
    except FileNotFoundError:
        valence_model = LexicalAnalysis.fit_beta_reg(valence, vad_embeddings, df, 'v_group')
        Serialization.save_obj(valence_model, 'valence_model')
    LexicalAnalysis.goodness_of_fit(valence_model, valence, vad_embeddings)
    return valence_model
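# init_vad() expects a tab-separated lexicon with at least 'Word' and 'Valence'
# columns; this layout is consistent with the NRC-VAD lexicon (Word, Valence,
# Arousal, Dominance, scores in [0, 1]), though the exact file behind the
# hardcoded path is an assumption:
#
#   Word<TAB>Valence<TAB>Arousal<TAB>Dominance
#   <word><TAB><score><TAB><score><TAB><score>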
def lemmatization_and_pos_filter(filename, common_users):
    """ preprocessing data towards topic modeling
    :param filename: a csv file with code-switched or monolingual data
    :param common_users: a list of users with both types of posts
    """
    stop_words = stopwords.words('english')
    with open(filename, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = next(csv_reader)
        data = []
        for line in csv_reader:
            if len(line) < 8: continue
            if len(line[7].split()) < MIN_SENTENCE_LENGTH: continue
            if line[0].strip() not in common_users: continue
            data.append(line[7])
        # end for
    # end with
    print('total of', len(data), 'posts')
    tokens = sum([len(post.split()) for post in data])
    print('average post length', float(tokens) / len(data))
    print('converting posts to words...')
    data_words = list(Utils.post_to_words(data))
    print('performing lemmatization and pos filtering...')
    data_words = Utils.lemmatization(data_words)
    print('removing stopwords and infrequent words...')
    ranks = Serialization.load_obj('dict.ranks')
    data_words = Utils.remove_noncontent_words(data_words, stop_words, ranks)
    Serialization.save_obj(data_words, current_mode + '.preprocessed')
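# Utils.remove_noncontent_words is defined elsewhere; a minimal sketch of the
# filtering implied by its usage here: drop stopwords and words whose Wikipedia
# rank is missing or beyond a frequency cutoff (the 50000 cutoff is an assumed
# value, not taken from the original code):
def remove_noncontent_words_sketch(data_words, stop_words, ranks, max_rank=50000):
    # data_words: list of tokenized posts (lists of lowercased words)
    stop_words = set(stop_words)
    return [[word for word in post
             if word not in stop_words and ranks.get(word, max_rank + 1) <= max_rank]
            for post in data_words]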
def extract_users_common_set():
    """ extract a set of users with both code-switched and english monolingual posts """
    users_cs = []
    filename = '<a csv file with code-switched posts>'
    with open(filename, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = next(csv_reader)
        for line in csv_reader:
            if len(line) < 8: continue
            users_cs.append(line[0].strip())
        # end for
    # end with
    users_non_cs = []
    filename = '<a csv file with monolingual english posts>'
    with open(filename, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = next(csv_reader)
        for line in csv_reader:
            if len(line) < 8: continue
            users_non_cs.append(line[0].strip())
        # end for
    # end with
    common_users = set(users_cs).intersection(set(users_non_cs))
    print('total cs users, monolingual users, common users:',
          len(set(users_cs)), len(set(users_non_cs)), len(common_users))
    Serialization.save_obj(common_users, 'common.users')
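# The 'common.users' object saved here is presumably what load_data() and
# lemmatization_and_pos_filter() expect as their `common_users` argument:
# common_users = Serialization.load_obj('common.users')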
def test_formality_difference():
    """ extracts two lists of per-user (in)formality marker frequencies and
    performs a pair-wise Wilcoxon significance test for the difference
    """
    markers = Formality.load_formality_markers()
    cs_object_name = '<pickle object with map: author to cs texts>'
    non_cs_object_name = '<pickle object with map: author to monolingual english texts>'
    cs_texts = Serialization.load_obj(cs_object_name)
    non_cs_texts = Serialization.load_obj(non_cs_object_name)
    print('loaded', len(cs_texts), 'and', len(non_cs_texts), 'cs and monolingual english by authors')
    for author in cs_texts:
        cs_texts[author] = ' '.join(cs_texts[author])
    for author in non_cs_texts:
        non_cs_texts[author] = ' '.join(non_cs_texts[author])
    cs_markers_by_authors, non_cs_markers_by_authors = Formality.extract_markers(
        cs_texts, non_cs_texts, markers)
    #print(cs_markers_by_authors, non_cs_markers_by_authors)
    print('mean markers frequency in cs:', np.mean(cs_markers_by_authors),
          'in non-cs:', np.mean(non_cs_markers_by_authors))
    Serialization.save_obj(cs_markers_by_authors, 'formality.markers.cs')
    Serialization.save_obj(non_cs_markers_by_authors, 'formality.markers.non-cs')
    stat, pval = wilcoxon(cs_markers_by_authors, non_cs_markers_by_authors)
    print('wilcoxon signed-rank (paired) sig test pval:', pval, stat)
    mean1 = np.mean(cs_markers_by_authors)
    mean2 = np.mean(non_cs_markers_by_authors)
    std1 = np.std(cs_markers_by_authors)
    std2 = np.std(non_cs_markers_by_authors)
    r1, _ = spearmanr(cs_markers_by_authors, non_cs_markers_by_authors)
    r2, _ = pearsonr(cs_markers_by_authors, non_cs_markers_by_authors)
    print(mean1, mean2, std1, std2, r1, r2)
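# Formality.load_formality_markers and Formality.extract_markers are defined
# elsewhere; a sketch of the per-author marker-frequency computation implied
# above, assuming `markers` is a set of (in)formality marker words and that the
# two returned lists are aligned by author so the Wilcoxon test is paired:
def extract_markers_sketch(cs_texts, non_cs_texts, markers):
    cs_freqs, non_cs_freqs = [], []
    for author in sorted(set(cs_texts) & set(non_cs_texts)):
        for texts, freqs in ((cs_texts, cs_freqs), (non_cs_texts, non_cs_freqs)):
            tokens = texts[author].split()
            hits = sum(1 for token in tokens if token in markers)
            freqs.append(hits / len(tokens) if tokens else 0.0)
    return cs_freqs, non_cs_freqs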
def topical_differences_sig_analysis():
    """ testing code-switching and monolingual english posts for topical differences
    (1) partition monolingual english posts into two random sets
    (2) perform topic modeling of each partition and compute the similarity between the two parts
        and their individual similarity to topics extracted from code-switched posts
    (3) test the multiple-experiment similarity scores for significance
    """
    data_object_name = 'monolingual.preprocessed'
    data_words = Serialization.load_obj(data_object_name)
    stop_words = stopwords.words('english')
    print('removing stopwords and infrequent words...')
    ranks = Serialization.load_obj('dict.ranks')
    data_words = Utils.remove_noncontent_words(data_words, stop_words, ranks)
    print('after pre-processing: total of', len(data_words), 'posts')
    topics = MONOLINGUAL_TOPICS
    for i in range(EXPERIMENTS):
        shuffle(data_words)
        part1 = data_words[:math.floor(len(data_words) / 2)]
        part2 = data_words[math.floor(len(data_words) / 2):]
        model = Utils.model_topic(part1, topics)
        Serialization.save_obj(model, 'lda.mallet.monolingual.part1.' + str(i))
        print('saved topic model: part1,', i)
        model = Utils.model_topic(part2, topics)
        Serialization.save_obj(model, 'lda.mallet.monolingual.part2.' + str(i))
        print('saved topic model: part2,', i)
        sys.stdout.flush()
    # end for
    inter = []
    intra = []
    ldamodel_cs = malletmodel2ldamodel(Serialization.load_obj('lda.mallet.cs'))
    for i in range(30):  # expected to match EXPERIMENTS above
        print('processing', i)
        ldamodel_mono1 = malletmodel2ldamodel(
            Serialization.load_obj('lda.mallet.monolingual.part1.' + str(i)))
        ldamodel_mono2 = malletmodel2ldamodel(
            Serialization.load_obj('lda.mallet.monolingual.part2.' + str(i)))
        diff_matrix1, _ = ldamodel_cs.diff(ldamodel_mono1, distance='jaccard')
        diff_matrix2, _ = ldamodel_cs.diff(ldamodel_mono2, distance='jaccard')
        #intra.append(np.mean([np.mean(np.matrix(diff_matrix1)), np.mean(np.matrix(diff_matrix2))]))
        intra.append(np.mean([np.min(np.matrix(diff_matrix1)), np.min(np.matrix(diff_matrix2))]))
        diff_matrix3, _ = ldamodel_mono1.diff(ldamodel_mono2, distance='jaccard')
        #inter.append(np.mean(np.matrix(diff_matrix3)))
        inter.append(np.min(np.matrix(diff_matrix3)))
    # end for
    print(np.mean(intra), np.mean(inter))
    _, pval = ranksums(intra, inter)
    print('pval:', pval)
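# Utils.model_topic is defined elsewhere; a sketch of Mallet-based LDA topic
# modeling consistent with the malletmodel2ldamodel conversion used above
# (this relies on gensim < 4.0, which still ships the Mallet wrapper; the
# Mallet binary path is a placeholder):
from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel

MALLET_PATH = '<path to mallet binary>'  # assumed location

def model_topic_sketch(data_words, num_topics):
    # data_words: list of tokenized posts (lists of content words)
    id2word = Dictionary(data_words)
    corpus = [id2word.doc2bow(post) for post in data_words]
    return LdaMallet(MALLET_PATH, corpus=corpus, num_topics=num_topics, id2word=id2word)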