Example #1
import csv
from os import listdir

# blend_keys() and extract_sample_features() are project helpers defined elsewhere.


def write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    csvf = open(f'{lexicon}_features_noverlap_blends_min1_samplewords.csv',
                'w',
                newline='')
    csvw = csv.writer(csvf, delimiter=',')

    T, F = 0, 0

    candidate_folder = f'/home/adam/Documents/lexical_blends_project/{lexicon}_blends_candidates_noverlap_1/'

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('### reading blend:', i, blend)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]

                feature_set, label = extract_sample_features(
                    blend, cw1, cw2, lexicon, corpus, sw1, sw2)
                entry = [str(x) for x in feature_set.values()]

                if label:
                    T += 1
                else:
                    F += 1

                csvw.writerow(entry)
        print(blend, T, F)

    csvf.close()
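
The blend_keys() helper is never defined in these examples. A minimal stand-in consistent with how gold_blends is used above (indexed by a blend, unpacked into its two gold source words) could look like this; the two entries are purely illustrative:

def blend_keys():
    # Hypothetical gold data: each blend maps to its two source words.
    return {
        'brunch': ('breakfast', 'lunch'),
        'smog': ('smoke', 'fog'),
    }

sw1, sw2 = blend_keys()['brunch']   # ('breakfast', 'lunch')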
Example #2
import csv
import pickle
from multiprocessing import Pool
from os import listdir

import epitran
import gensim as gs


def multip_write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    csm = gs.models.Word2Vec.load(cg_path)
    epit = epitran.Epitran('swe-Latn')

    csvf = open('{0}_features_overlap_split_020818.csv'.format(lexicon),
                'w',
                newline='')
    csvw = csv.writer(csvf, delimiter=',')

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'

    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    #candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'

    cand_set = []

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        #print('#', i ,'reading', blend, 'from', candidate_folder+filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                if blend in [cw1, cw2]:
                    continue
                sw1, sw2 = gold_blends[blend]
                cand_set.append((blend, cw1, cw2, lexicon, corpus, sw1, sw2,
                                 freqd, csm, wsm, epit))

    for cand_chunk in chunks(cand_set, 10):
        with Pool(3) as p:
            entries = p.starmap(extract_sample_features, cand_chunk)
            print('# writing entries')
            for entry in entries:
                for e in entry:
                    csvw.writerow([str(x) for x in e[0].values()])

    csvf.close()
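
The chunks() helper used by both multiprocessing variants is also left undefined. Given the call chunks(cand_set, 10), a minimal sketch that yields consecutive slices of at most n items:

def chunks(lst, n):
    # Yield successive n-sized slices of lst; the last one may be shorter.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

list(chunks([1, 2, 3, 4, 5], 2))   # [[1, 2], [3, 4], [5]]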
Example #3
import csv
import pickle
from multiprocessing import Pool
from os import listdir


def multip_write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    csvf = open(
        '{0}_features_overlap_split_blends_charsim_280718.csv'.format(lexicon),
        'w',
        newline='')
    csvw = csv.writer(csvf, delimiter=',')

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'

    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    #candidate_folder = f'/home/adam/Documents/lexical_blends_project/{lexicon}_blend_candidates_1/'
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'

    cand_set = []

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        #print('#', i ,'reading', blend, 'from', candidate_folder+filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                if blend in [cw1, cw2]:
                    continue
                sw1, sw2 = gold_blends[blend]
                cand_set.append(
                    (blend, cw1, cw2, lexicon, corpus, sw1, sw2, freqd))

    for cand_chunk in chunks(cand_set, 10):
        with Pool(3) as p:
            entries = p.starmap(extract_sample_features, cand_chunk)
            print('# writing entries')
            for entry in entries:
                for e in entry:
                    csvw.writerow([str(x) for x in e[0].values()])

    csvf.close()
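
Both multiprocessing variants above construct a fresh Pool(3) inside the chunk loop, so three worker processes are started and torn down for every ten candidate tuples; example #2 additionally ships the loaded models (csm, wsm, epit) inside every argument tuple, which pickles them over to the workers task by task. A sketch of the same write loop with the pool hoisted out of the loop, assuming the chunks(), extract_sample_features(), cand_set, and csvw names from the examples above:

from multiprocessing import Pool

with Pool(3) as p:
    # One pool for the whole run; starmap still dispatches chunk by chunk.
    for cand_chunk in chunks(cand_set, 10):
        for entry in p.starmap(extract_sample_features, cand_chunk):
            for e in entry:
                csvw.writerow([str(x) for x in e[0].values()])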
Example #4
import csv
import pickle
from os import listdir


def write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    csvf = open(f'{lexicon}_features_overlap_blends_min1.csv',
                'w',
                newline='')
    csvw = csv.writer(csvf, delimiter=',')

    T, F = 0, 0

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'

    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    candidate_folder = f'/home/adam/Documents/lexical_blends_project/{lexicon}_blend_candidates_1/'

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('#', i, 'reading', blend, 'from', candidate_folder + filename)
        with open(candidate_folder + filename) as f:

            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]

                #print('### blend:', blend, 'gold:', (sw1, sw2), 'sample:', (cw1, cw2))
                feature_set = extract_sample_features(blend, cw1, cw2, lexicon,
                                                      corpus, sw1, sw2, freqd)
                for features, label in feature_set:
                    if not features:
                        continue
                    if label:
                        T += 1
                    else:
                        F += 1

                    entry = [str(x) for x in features.values()]
                    csvw.writerow(entry)
        print(blend, T, F)

    csvf.close()
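
Every variant serializes a row the same way: one feature dict becomes one comma-separated line, with each value coerced to str first. A self-contained toy illustration (the feature names and values are made up):

import csv
import io

features = {'sw1_len': 7, 'sw2_len': 4, 'lcs_sw1_sw2': 2}
buf = io.StringIO()
csv.writer(buf, delimiter=',').writerow([str(x) for x in features.values()])
print(buf.getvalue())   # '7,4,2\r\n'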
Example #5
import csv
import pickle
from os import listdir

import epitran
import gensim as gs
from gensim.models import FastText


def write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/cc.sv.300.bin'
    # NOTE: load_fasttext_format() is deprecated in gensim 3.x and removed in
    # 4.x, where gensim.models.fasttext.load_facebook_model() replaces it.
    csm = FastText.load_fasttext_format(cg_path)
    #cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    #csm = FastText.load(cg_path)
    epit = epitran.Epitran('swe-Latn')

    col_names = [
        'sw1_charemb_score', 'sw2_charemb_score', 'blend_charemb_score',
        'sw1_sw2_charemb_sim', 'sw1_blend_charemb_sim',
        'sw2_blend_charemb_sim', 'sw1_wordemb_score', 'sw2_wordemb_score',
        'blend_wordemb_score', 'sw1_blend_wordemb_sim',
        'sw2_blend_wordemb_sim', 'sw1_sw2_wordemb_sim', 'splits',
        'sw1_sw2_char_bigramsim', 'sw2_sw1_char_bigramsim',
        'sw1_sw2_char_trigramsim', 'sw2_sw1_char_trigramsim', 'lcs_sw1_sw2',
        'sw1_blend_IPA_lev_dist', 'sw2_blend_IPA_lev_dist',
        'sw1_sw2_IPA_lev_dist', 'sw1_blend_lev_dist', 'sw2_blend_lev_dist',
        'sw1_sw2_lev_dist', 'sw1_graphemes', 'sw2_graphemes', 'sw1_syllables',
        'sw2_syllables', 'sw1_len', 'sw2_len', 'sw1_contrib', 'sw2_contrib',
        'sw1_sw2_removal', 'sw1_aff_c', 'sw1_N_c', 'sw2_aff_c', 'sw2_N_c',
        'sp1', 'sp2', 'sp3', 'LABEL', 'BLEND', 'CW1', 'CW2', 'CW1_split',
        'CW2_split'
    ]

    csvf = open('overlap_splitp_040918.csv', 'w', newline='')
    csvw = csv.DictWriter(csvf, delimiter=',', fieldnames=col_names)
    # Emit the column names from col_names as the first row of the file.
    csvw.writeheader()

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'

    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    #candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        print('#', i, 'reading', blend)
        with open(candidate_folder + filename) as f:

            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                sw1, sw2 = gold_blends[blend]

                #print('### blend:', blend, 'gold:', (sw1, sw2), 'sample:', (cw1, cw2))
                feature_set = extract_sample_features(blend, cw1, cw2, lexicon,
                                                      corpus, sw1, sw2, freqd,
                                                      wsm, csm, epit)
                for features, label in feature_set:
                    #entry = list(map(lambda x: str(x), features.values()))
                    csvw.writerow(features)

    csvf.close()
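
Unlike the csv.writer variants above, this version uses csv.DictWriter, so each row is ordered by fieldnames rather than by the dict's insertion order; the long col_names list is therefore the single source of truth for the column layout. A small self-contained demonstration of that ordering guarantee:

import csv
import io

buf = io.StringIO()
w = csv.DictWriter(buf, fieldnames=['a', 'b', 'c'])
w.writeheader()
w.writerow({'c': 3, 'a': 1, 'b': 2})   # key order in the dict is irrelevant
print(buf.getvalue())   # 'a,b,c\r\n1,2,3\r\n'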