import csv
import pickle
from multiprocessing import Pool
from os import listdir

import epitran
import gensim as gs
from gensim.models import FastText

# blend_keys(), extract_sample_features() and chunks() are defined elsewhere
# in the project; sketches of blend_keys() and chunks() follow their first use.


def write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()  # {blend: (source_word_1, source_word_2)}
    csvf = open(f'{lexicon}_features_noverlap_blends_min1_samplewords.csv',
                'w', newline='')
    csvw = csv.writer(csvf, delimiter=',')
    T, F = 0, 0  # running counts of positive/negative samples
    candidate_folder = f'/home/adam/Documents/lexical_blends_project/{lexicon}_blends_candidates_noverlap_1/'
    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('### reading blend:', i, blend)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]
                feature_set, label = extract_sample_features(
                    blend, cw1, cw2, lexicon, corpus, sw1, sw2)
                entry = [str(v) for v in feature_set.values()]
                if label:
                    T += 1
                else:
                    F += 1
                csvw.writerow(entry)
        print(blend, T, F)
    csvf.close()

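# blend_keys() is not defined in this file. A minimal sketch of what it is
# assumed to do, given its use above: return a dict mapping each gold blend to
# its two source words. The file name and format here are hypothetical.
def blend_keys():
    """Load the gold blends as {blend: (source_word_1, source_word_2)}."""
    gold = {}
    with open('gold_blends.csv') as f:  # hypothetical blend,sw1,sw2 rows
        for ln in f:
            blend, sw1, sw2 = ln.rstrip().split(',')
            gold[blend] = (sw1, sw2)
    return gold
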
def multip_write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()
    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)  # word-level embeddings
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    csm = gs.models.Word2Vec.load(cg_path)  # character-level embeddings
    epit = epitran.Epitran('swe-Latn')  # grapheme-to-IPA transliterator
    csvf = open('{0}_features_overlap_split_020818.csv'.format(lexicon),
                'w', newline='')
    csvw = csv.writer(csvf, delimiter=',')
    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'
    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)
    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    #candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'
    cand_set = []
    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        #print('#', i, 'reading', blend, 'from', candidate_folder + filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                # skip candidate pairs that contain the blend itself
                if blend in [cw1, cw2]:
                    continue
                sw1, sw2 = gold_blends[blend]
                cand_set.append((blend, cw1, cw2, lexicon, corpus, sw1, sw2,
                                 freqd, csm, wsm, epit))
    for cand_chunk in chunks(cand_set, 10):
        with Pool(3) as p:
            entries = p.starmap(extract_sample_features, cand_chunk)
        print('# writing entries')
        for entry in entries:
            for e in entry:
                csvw.writerow([str(v) for v in e[0].values()])
    csvf.close()

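# chunks() is used by both multiprocessing variants but is not defined in this
# section. A minimal sketch, assuming it yields consecutive fixed-size slices
# of the candidate list so each Pool.starmap call gets a bounded batch:
def chunks(seq, n):
    """Yield successive n-sized slices from seq."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]
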
def multip_write_features_to_csv_charsim():
    # Renamed from multip_write_features_to_csv to avoid shadowing the
    # definition above; this variant does not pass the embedding models or
    # the IPA transliterator to extract_sample_features.
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()
    csvf = open(
        '{0}_features_overlap_split_blends_charsim_280718.csv'.format(lexicon),
        'w', newline='')
    csvw = csv.writer(csvf, delimiter=',')
    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'
    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)
    #candidate_folder = f'/home/adam/Documents/lexical_blends_project/{lexicon}_blend_candidates_1/'
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    cand_set = []
    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        #print('#', i, 'reading', blend, 'from', candidate_folder + filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                # skip candidate pairs that contain the blend itself
                if blend in [cw1, cw2]:
                    continue
                sw1, sw2 = gold_blends[blend]
                cand_set.append(
                    (blend, cw1, cw2, lexicon, corpus, sw1, sw2, freqd))
    for cand_chunk in chunks(cand_set, 10):
        with Pool(3) as p:
            entries = p.starmap(extract_sample_features, cand_chunk)
        print('# writing entries')
        for entry in entries:
            for e in entry:
                csvw.writerow([str(v) for v in e[0].values()])
    csvf.close()

def write_features_to_csv_overlap():
    # Renamed from write_features_to_csv to avoid shadowing the first
    # definition in this file; this variant reads the overlap candidates.
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()
    csvf = open(f'{lexicon}_features_overlap_blends_min1.csv', 'w', newline='')
    csvw = csv.writer(csvf, delimiter=',')
    T, F = 0, 0  # running counts of positive/negative samples
    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'
    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)
    candidate_folder = f'/home/adam/Documents/lexical_blends_project/{lexicon}_blend_candidates_1/'
    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('#', i, 'reading', blend, 'from', candidate_folder + filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]
                #print('### blend:', blend, 'gold:', (sw1, sw2), 'sample:', (cw1, cw2))
                feature_set = extract_sample_features(blend, cw1, cw2, lexicon,
                                                      corpus, sw1, sw2, freqd)
                for features, label in feature_set:
                    if not features:
                        continue
                    if label:
                        T += 1
                    else:
                        F += 1
                    csvw.writerow([str(v) for v in features.values()])
        print(blend, T, F)
    csvf.close()

def write_features_to_csv_splitp():
    # Renamed from write_features_to_csv to avoid shadowing; this variant
    # writes named columns through csv.DictWriter.
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()
    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)  # word-level embeddings
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/cc.sv.300.bin'
    csm = FastText.load_fasttext_format(cg_path)  # character-level embeddings
    #cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    #csm = FastText.load(cg_path)
    epit = epitran.Epitran('swe-Latn')  # grapheme-to-IPA transliterator
    col_names = [
        'sw1_charemb_score', 'sw2_charemb_score', 'blend_charemb_score',
        'sw1_sw2_charemb_sim', 'sw1_blend_charemb_sim', 'sw2_blend_charemb_sim',
        'sw1_wordemb_score', 'sw2_wordemb_score', 'blend_wordemb_score',
        'sw1_blend_wordemb_sim', 'sw2_blend_wordemb_sim', 'sw1_sw2_wordemb_sim',
        'splits', 'sw1_sw2_char_bigramsim', 'sw2_sw1_char_bigramsim',
        'sw1_sw2_char_trigramsim', 'sw2_sw1_char_trigramsim', 'lcs_sw1_sw2',
        'sw1_blend_IPA_lev_dist', 'sw2_blend_IPA_lev_dist',
        'sw1_sw2_IPA_lev_dist', 'sw1_blend_lev_dist', 'sw2_blend_lev_dist',
        'sw1_sw2_lev_dist', 'sw1_graphemes', 'sw2_graphemes', 'sw1_syllables',
        'sw2_syllables', 'sw1_len', 'sw2_len', 'sw1_contrib', 'sw2_contrib',
        'sw1_sw2_removal', 'sw1_aff_c', 'sw1_N_c', 'sw2_aff_c', 'sw2_N_c',
        'sp1', 'sp2', 'sp3', 'LABEL', 'BLEND', 'CW1', 'CW2', 'CW1_split',
        'CW2_split'
    ]
    # The original called .format(lexicon) on a string with no placeholder;
    # restored the placeholder to match the other output-file names.
    csvf = open('{0}_overlap_splitp_040918.csv'.format(lexicon), 'w',
                newline='')
    csvw = csv.DictWriter(csvf, delimiter=',', fieldnames=col_names)
    csvw.writeheader()  # write the column names once at the top of the file
    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'
    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)
    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    #candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'
    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('#', i, 'reading', blend)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]
                #print('### blend:', blend, 'gold:', (sw1, sw2), 'sample:', (cw1, cw2))
                feature_set = extract_sample_features(blend, cw1, cw2, lexicon,
                                                      corpus, sw1, sw2, freqd,
                                                      wsm, csm, epit)
                for features, label in feature_set:
                    # features already carries the LABEL column, so the dict
                    # is written as-is
                    csvw.writerow(features)
    csvf.close()

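# multiprocessing.Pool re-imports the calling module under the 'spawn' start
# method (the default on Windows and macOS), so the entry point should be
# guarded. A minimal sketch of running one of the extractors above:
if __name__ == '__main__':
    multip_write_features_to_csv()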