Example #1
import argparse
import sys

import pymagnitude as mag


def main():
    args = argparse.ArgumentParser()
    args.add_argument(
        'source',
        help='source language word vector file (.magnitude)',
    )
    args.add_argument(
        'target',
        help='target language word vector file (.magnitude)',
    )
    args.add_argument(
        '-n',
        dest='count',
        type=int,
        default=10,
        help='number of neighbors per word',
    )
    argv = args.parse_args()
    src = mag.Magnitude(argv.source)
    tgt = mag.Magnitude(argv.target)
    for word in sys.stdin:
        word = word.rstrip()
        v = src.query(word)
        neighbors = tgt.most_similar(v)
        show(word, neighbors, argv.count)
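
The excerpt also calls a show helper that is defined elsewhere in the original script. A minimal sketch of what it plausibly does, written here as an assumption rather than the original implementation:

def show(word, neighbors, count):
    # Hypothetical helper (not part of the original snippet): print the source
    # word followed by its top `count` target-language neighbors.
    # most_similar() returns (key, similarity) pairs.
    print(word, ' '.join(key for key, _ in neighbors[:count]))
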
Example #2
    def __init__(self):
        self.q_and_as = [{
            'question': '''Following a C3-C7 laminoplasty in a myelopathic
             patient with cervical stenosis, the most common neurologic
             complication would manifest with which of the following new
             postoperative exam findings?''',
            'keywords': [['bicep'], ['weakness']],
            'answer': 'Bicep weakness'
        }, {
            'question':
            '''Which variable has the strongest
            association with poor clinical outcomes in patients who undergo
            expansive laminoplasty for cervical spondylotic myelopathy?''',
            'keywords': [['angle'], ['small', 'few', '13', 'degrees'],
                         ['kyphosis']],
            'answer':
            'Local kyphosis angle > 13 degrees'
        }, {
            'question': '''Which classification system for cervical myelopathy
            focuses exclusively on lower extremity function?''',
            'keywords': [['nurick']],
            'answer': 'Nurick'
        }, {
            'question':
            '''Motor-dominant radiculopathy with weakness of the
            deltoid''',
            'keywords': [['motor'], ['radiculopathy'], ['weak'], ['deltoid']]
        }]

        self.q_index = 0
        self.wv = pymagnitude.Magnitude(
            '../wiki-news-300d-1M-subword.magnitude')
def create_doodle_vocab(doodle_class_path,
                        w2v_magnitdue_path,
                        wordmap_path,
                        out_doodle_path,
                        topn=10):
    # Read retrofitted w2v vectors
    wv = pymagnitude.Magnitude(w2v_magnitdue_path)

    # read doodle
    with open(doodle_class_path, 'r') as j:
        doodle = json.load(j)

    # Build doodle+ list: each doodle class plus its topn nearest-neighbor words
    doodle_plus = []
    for d in doodle:
        doodle_plus.append(d)
        doodle_plus += set(
            [k[0].lower() for k in wv.most_similar(d, topn=topn)])

    # Read wordmap
    with open(wordmap_path, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

    # find the intersection with vocab
    doodle_map = {
        d: word_map.get(d)
        for d in doodle_plus if word_map.get(d) is not None
    }

    with open(out_doodle_path, 'w') as j:
        json.dump(doodle_map, j)
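
A hypothetical invocation with placeholder paths, showing the inputs the function expects and the word-to-index subset it writes out:

# All paths below are placeholders, not from the original project.
create_doodle_vocab(
    doodle_class_path='data/doodle_classes.json',     # list of doodle class names
    w2v_magnitdue_path='data/retrofitted.magnitude',  # retrofitted word vectors
    wordmap_path='data/WORDMAP.json',                 # word -> index map
    out_doodle_path='data/doodle_vocab.json',         # output: word -> index for matched words
    topn=10)
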
Example #4
def sketch2caption(doodle_class,
                   checkpoint,
                   word_map_path,
                   w2v_magnitdue_path,
                   beam_size=1,
                   num_sen=5):
    # read wordmap
    with open(word_map_path, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word

    # read w2v
    w2v = pymagnitude.Magnitude(w2v_magnitdue_path)

    # Load model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(checkpoint)
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()

    keyword_size = checkpoint['keyword_size']
    failure = 0

    # generate keywords
    key_candidates = [
        w[0].lower()
        for w in w2v.most_similar(doodle_class, topn=keyword_size * 10)
    ]
    key_candidates = [w for w in key_candidates if word_map.get(w) is not None]
    # if len(key_candidates) < keyword_size:
    #     failure = 100

    # Encode, decode with attention and beam search
    sentences = []
    keys = []
    sent_count = 0
    while (sent_count < num_sen):
        random.shuffle(key_candidates)
        key = [doodle_class]
        key += key_candidates[:keyword_size - 1]
        if len(key) < keyword_size:
            key = [doodle_class] * keyword_size
        seq = caption_beam_search(decoder, key, word_map, beam_size)
        unk_count = [s for s in seq if s in {word_map['<unk>']}]
        if len(seq) == 0:  # or len(unk_count) > 0:
            # print('Caption is not generated on ', key)
            failure += 1
            if failure > num_sen * 10:
                break
            else:
                continue
        sentences.append([
            rev_word_map.get(s) for s in seq if s not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        keys.append(key)
        sent_count += 1

    return keys, sentences, failure > 0
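
A hypothetical call with placeholder paths, illustrating the inputs and the three return values (the keyword sets fed to the decoder, the generated token lists, and a flag indicating whether any beam search failed):

# Paths and the doodle class are placeholders, not from the original project.
keys, sentences, failed = sketch2caption(
    doodle_class='cat',
    checkpoint='checkpoints/decoder_checkpoint.pth.tar',
    word_map_path='data/WORDMAP.json',
    w2v_magnitdue_path='data/retrofitted.magnitude',
    beam_size=3,
    num_sen=5)
for k, words in zip(keys, sentences):
    print(k, '->', ' '.join(w for w in words if w is not None))
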
Example #5
def main(opts):
    
    wvecs = pymagnitude.Magnitude(opts.wordvecfile)
    dim = wvecs.dim
    word_vecs = {}
    
    for k in tqdm.trange(len(wvecs)):
        word_vecs[wvecs[k][0]] = wvecs[k][1]

    alpha = 0.075
    ontology = read_lexicon(opts.lexicon)

    for t in tqdm.trange(10):
        for i in range(len(wvecs)):
            
            wordvec = wvecs[i]		
            sums = edge_vectors(word_vecs, wordvec[0], ontology, dim)
            word_vecs[wordvec[0]] = (sums[0] + alpha*wordvec[1]) / (sums[1] + alpha)

    f = open(opts.output, 'w', encoding = 'utf-8')
    
    for k,v in word_vecs.items():
        f.write(str(k))
        for x in v:
            f.write(' ' + str(x))
        f.write('\n')

    f.close()
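
read_lexicon and edge_vectors are project helpers that the excerpt does not show. Judging from the update rule above, edge_vectors returns the sum of the current vectors of the word's lexicon neighbors together with the number of neighbors found; a rough sketch under that assumption:

import numpy as np

def edge_vectors(word_vecs, word, ontology, dim):
    # Hypothetical reconstruction, not the original helper: sum the current
    # vectors of the word's lexicon neighbors and count how many are in the
    # vocabulary. The caller blends this with the original vector in a
    # retrofitting-style update.
    total = np.zeros(dim)
    count = 0
    for neighbor in ontology.get(word, []):
        if neighbor in word_vecs:
            total += word_vecs[neighbor]
            count += 1
    return total, count
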
 def readMagnitude(self, wv_magnitude_file):
     wv = pymagnitude.Magnitude(wv_magnitude_file)
     word_vectors = {}
     for key, vectors in wv:
         word_vectors[key] = np.zeros(len(vectors))
         for index, vector in enumerate(vectors):
             word_vectors[key][index] = vector
     return word_vectors
Example #7
def create_res_fr_en(eng_mag, fr_mag, b_dict, t_dict, output_f):
    print("first")
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)

    # we create two dictionaries from the dict data
    # one goes en->fr and the other fr->en
    # this is the training set
    data_dict_en_to_fr = {}
    data_dict_fr_to_en = {}
    with open(b_dict) as f:
        for line in f:
            pair = line.split(" ")
            pair[1] = pair[1][:-1]
            data_dict_en_to_fr[pair[0]] = pair[1]
            data_dict_fr_to_en[pair[1]] = pair[0]

    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(data_dict_fr_to_en[key])
        fr = fr_vectors.query(key)
        en_mat.append(en)
        fr_mat.append(fr)
    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)

    u, sig, vt = np.linalg.svd(np.matmul(en_mat.transpose(), fr_mat))

    W = np.matmul(np.transpose(vt), np.transpose(u))

    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            line = line[:-1]
            pair = line.split(" ")
            word = eng_vectors.most_similar(np.matmul(fr_vectors.query(pair[1]), W), topn=5)#[0][0]
            to_add = ""
            #print(i)
            for w in word:
                to_add += (" " + w[0])
            line = pair[1] + " " + pair[0] + " " + to_add[1:]
            final.append(line)
            if i % 100==0: print(i)

    np.savetxt(output_f, final, fmt="%s")
def create_res_fr_en(eng_mag, fr_mag, b_dict, t_dict, output_f):
    print("first")
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)

    some_keys = []
    i = 0
    for key, vector in eng_vectors:
        if key in fr_vectors:
            some_keys.append(key)
        i+=1

    data_dict_fr_to_en = {}
    for line in some_keys:
        data_dict_fr_to_en[line] = line

    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(data_dict_fr_to_en[key])
        fr = fr_vectors.query(key)
        en_mat.append(en)
        fr_mat.append(fr)
    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)

    u, sig, vt = np.linalg.svd(np.matmul(en_mat.transpose(), fr_mat))

    W = np.matmul(np.transpose(vt), np.transpose(u))

    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            line = line[:-1]
            pair = line.split(" ")
            word = eng_vectors.most_similar(np.matmul(fr_vectors.query(pair[1]), W), topn=5)#[0][0]
            to_add = ""
            #print(i)
            for w in word:
                to_add += (" " + w[0])
            line = pair[1] + " " + pair[0] + " " + to_add[1:]
            final.append(line)
            if i % 100==0: print(i)

    np.savetxt(output_f, final, fmt="%s")
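
Both functions above learn the cross-lingual mapping with the orthogonal Procrustes solution: with paired row matrices en_mat and fr_mat, svd(en_mat.T @ fr_mat) = U S Vt gives W = Vt.T @ U.T, the orthogonal matrix that best carries the French rows onto the English rows, which is why the code multiplies French query vectors by W before searching eng_vectors.most_similar. A small self-contained sanity check of that construction on synthetic data (nothing here comes from the original project):

import numpy as np

# Make a "French" space that is an exact rotation of the "English" space.
rng = np.random.default_rng(0)
en = rng.normal(size=(50, 8))
q, _ = np.linalg.qr(rng.normal(size=(8, 8)))  # random orthogonal matrix
fr = en @ q

# Same construction as in the examples above.
u, sig, vt = np.linalg.svd(en.T @ fr)
W = vt.T @ u.T

# W maps French vectors back into the English space.
assert np.allclose(fr @ W, en)
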
Example #9
 def __init__(self,
              retrofitted_magnitude,
              wvec_file,
              retrofitted_vector,
              topn=10):
     self.retrofitted_magnitude = pymagnitude.Magnitude(
         retrofitted_magnitude
     )  # This is the Q_hat vector: 100-dimensional GloVe word vectors
     self.topn = topn
     self.wvecs = wvec_file
     self.wvecKey = set(self.wvecs.keys())
     self.retrofitted_vector = retrofitted_vector
Example #10
    def load_pymagnitude_model(self, given_model_name=None, language=None):
        '''load models; simple wrapper'''

        t0 = time()

        # ugly but from tut:
        import pymagnitude

        print("loading pymagnitude model {} ...".format(given_model_name))
        if language is None:
            self.embedding_model = pymagnitude.Magnitude(given_model_name)
        else:
            self.embedding_model = pymagnitude.Magnitude(given_model_name, language=language)
        print("... done in %0.3fs." % (time() - t0))

        print("initializing for most_similar-searches...")
        t0 = time()
        print(self.embedding_model.most_similar(positive=["test"]))
        print("... done in %0.3fs." % (time() - t0))

        return
Example #11
    def __init__(self,
                 filepath='source/wiki-news-300d-1M-subword.magnitude',
                 dimensions=300):
        """
        Load the pretrained Embeddings

        :param string filename: Path to pymagnitude file as *.magnitude
        :param int dimensions: Dimensions of the Vectors (to generate zeros for padding)
        """

        self.dimensions = dimensions
        self.filepath = filepath
        self.vectors = magnitude.Magnitude(filepath)
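
The stored dimensions value matters only because padding a batch requires zero vectors of the right width. A hypothetical companion method, not part of the original class, illustrating that use:

    def embed_padded(self, tokens, max_len):
        # Hypothetical helper: query one vector per token and pad the
        # sequence with zero vectors up to max_len.
        import numpy as np
        vecs = [self.vectors.query(t) for t in tokens[:max_len]]
        vecs += [np.zeros(self.dimensions)] * (max_len - len(vecs))
        return np.stack(vecs)
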
Example #12
def main(opts):
    
    wvecs = pymagnitude.Magnitude(opts.wordvecfile)
    dim = wvecs.dim
    word_vecs = {}
    
    #copy the read-only word vectors in a dictionary for modification
    for k in tqdm.trange(len(wvecs)):
        word_vecs[wvecs[k][0]] = wvecs[k][1]
    
    alpha = 0.075
    #normalizing the lexicon to all lower case letters
    ontology = read_lexicon(opts.lexicon)
    
    for t in tqdm.trange(10):
        for i in range(len(wvecs)):
            
            wordvec = wvecs[i]	
            #calculate the retrofitted matrix by increasing the similarities between the synonyms as per the lexicon file
            sums = edge_vectors(word_vecs, wordvec[0], ontology, dim)
            word_vecs[wordvec[0]] = (sums[0] + alpha*wordvec[1]) / (sums[1] + alpha)

    f = open(opts.output, 'w', encoding = 'utf-8')
    
    #write the retrofitted word vectors in output file
    for k,v in word_vecs.items():
        f.write(str(k))
        for x in v:
            f.write(' ' + str(x))
        f.write('\n')

    f.close()


if __name__ == '__main__':
    optparser = optparse.OptionParser()
    optparser.add_option("-w", "--wordvecfile", dest="wordvecfile", default=os.path.join('data', 'glove.6B.100d.magnitude'), help="word vectors file")
    optparser.add_option("-l", "--lexicon", dest="lexicon", default=os.path.join('data', 'lexicons', 'wordnet-synonyms.txt'), help="lexicon path")
    optparser.add_option("-o", "--output", dest="output", default=os.path.join('data', 'glove.6B.100d.retrofit.txt'), help="output txt file path")
    (opts, _) = optparser.parse_args()

    main(opts)
Example #13
 def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
     self.vectors = pymagnitude.Magnitude(vectors_path, normalized=False)
     self.scaling = scaling
def topic_extractor(data_df, type_of_extraction):
    """Topic extractor function extracts topics from a csv file entered in file path.
    Depending on Type it can extract topics of individual papers or the whole session"""
    def topic_params_object(topics, words, vectors_300):
        #normalization parameters
        start_range = 1
        end_range = 10
        topic_parameters = []
        topics_ids = topics.columns.values.tolist()
        topic_weight = 0
        for each_topic in topics:
            topic_average_vector = []
            this_topic = {
                'topic_id': each_topic,
                'vector': [],
                'vector300': [],
                'words': [],
                'weight': 0
            }
            this_topic['words'] = topics[each_topic].dropna().values.tolist()
            for word in this_topic['words']:
                if word[len(word) - 1] == '*':
                    this_topic['vector'].append(words[words['word*'] == word]
                                                ['vector'].values.tolist()[0])
                    this_topic['weight'] = this_topic['weight'] + words[
                        words['word*'] == word]['sigma_nor'].values.tolist()[0]
                    this_topic['vector300'].append(
                        words[words['word*'] ==
                              word]['vector300'].values.tolist()[0])
                    # pdb.set_trace()
            this_topic['vector'] = numpy.mean(this_topic['vector'], axis=0)
            this_topic['vector300'] = numpy.mean(this_topic['vector300'],
                                                 axis=0)
            topic_parameters.append(this_topic)
        #Normalize values between 0-1
        df_topic_parameters = pd.DataFrame(topic_parameters)
        df_topic_parameters_weight = df_topic_parameters['weight']
        df_topic_parameters['weight'] = (end_range - start_range) * (
            df_topic_parameters_weight - df_topic_parameters_weight.min()) / (
                df_topic_parameters_weight.max() -
                df_topic_parameters_weight.min()) + start_range
        # pdb.set_trace()
        return df_topic_parameters

    #SETUP LANGUAGE MODEL AND PIPELINE VARIABLES####
    lang = 'en'
    language_model = {
        'en': './classes/nsaSrc/data/external/wiki-news-300d-1M.magnitude'
    }
    if type_of_extraction == 'session':
        percentile_C = 95
    else:
        percentile_C = 80
    target_dim = 10
    cluster_selection_method = 'leaf'

    #We only get the ones that have text
    data_df = data_df[data_df['text'] != 'Parsing Error']

    def en_filter(text):
        spacey_doc = nlp(text)
        # pdb.set_trace()
        sentences = []
        for sentence in spacey_doc.sents:
            for token in sentence:
                if (len(token) >= 4 and not token.is_stop
                        and not token.like_num and not token.is_digit):
                    sentences.append(str(token))
                # else:
                #     print(token)
        return sentences

    # data_subset = [record for record in data if record['Text'] != False and 'sagepub' in record['Text'].lower()]

    # data_subset = data_df
    text_fn = {
        'en': './classes/nsaSrc/data/processed/en_flat.txt',
        'ko': './classes/nsaSrc/data/processed/ko_flat.txt'
    }

    # THIS CODE CREATES THE DATA FOR PROCESSING AND STORES IT IN ./vizlit/data/processed AS A FLAT TEXT FILE ##########
    #
    #
    if type_of_extraction == 'session':
        with open(text_fn[lang], 'w', encoding='utf-8') as fp:
            for record_text in data_df['text']:
                # if record['Text'] != False:
                sentences = en_filter(record_text)
                for s in sentences:
                    fp.write(s + '\n')
    elif type_of_extraction == 'document':
        with open(text_fn[lang], 'w', encoding='utf-8') as fp:
            # for x in range(2):
            for record_text in data_df['text']:
                # if record['Text'] != False:
                sentences = en_filter(
                    record_text)  #needs to be an array to work
                for sent in sentences:
                    fp.write(sent + '\n')

    ##########################

    ####################################

    ##FIND SIGNIFICANT TERMS IN CORPUS #########
    word_level_statistics = WordLevelStatistics(corpus_file=[text_fn[lang]],
                                                percentile_C=percentile_C)
    word_level_statistics.compute_spectra()
    full_collection = pd.DataFrame(word_level_statistics.level_stat)
    lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
    lvls_df['threshold'] = word_level_statistics.threshold
    # pdb.set_trace()
    #Minimize corpus to most important words
    significant_terms = word_level_statistics.significant_terms

    #SOMETHING BROKE HERE FOR SOME REASON LVLS_DF IS ONE VALUE BIGGER THAN IT SHOULD AFTER FILTERING
    # if type_of_extraction == 'session':
    #Remove numbers and short words
    spacey_significant_terms = nlp(' '.join(
        word_level_statistics.significant_terms))
    significant_terms = []
    for sentence in spacey_significant_terms.sents:
        for token in sentence:
            if (len(token) >= 4 and not token.is_stop
                    and not token.like_num and not token.is_digit):
                significant_terms.append(str(token))
            # else:
            #     # pdb.set_trace()
            #     # print (token)
            #     #Remove token from dataframe
            #     # pdb.set_trace()
            #     lvls_df = lvls_df[lvls_df.word != str(token)]
    lvls_df_filtered = pd.DataFrame()
    for each_word in significant_terms:
        lvls_df_filtered = lvls_df_filtered.append(
            lvls_df[lvls_df.word == each_word])
    lvls_df = lvls_df_filtered
    # # print('With threshold = {}, ({} percentile) find {} significant terms.'.format(
    # #     word_level_statistics.threshold, word_level_statistics.percentile_C, len(significant_terms)))

    ##CLUSTER WORD EMBEDDINGS
    vectors = {}
    for l in ['en']:
        vectors[l] = pymagnitude.Magnitude(language_model[l])
    significant_vectors = vectors[lang].query(significant_terms)

    try:
        fit = umap.UMAP(n_neighbors=15,
                        n_components=target_dim,
                        metric='euclidean')
        data_d2v = fit.fit_transform(
            significant_vectors
        )  #np.asfarray(significant_vectors, dtype='float64' ))
        if type_of_extraction == 'session':
            #store model
            joblib.dump(fit, model_file_name)
        fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
        vec_2d = fit.fit_transform(data_d2v)
    except Exception as ex:
        pdb.set_trace()
        logging.error(
            "Trying with less dimensions. Got exception {}".format(ex))
        # data_d2v = bhtsne.tsne(np.asfarray(significant_vectors, dtype='float64' ),dimensions=2)
        # vec_2d = data_d2v
        #Try again with fewer neighbors; this is just a temporary fix
        fit = umap.UMAP(n_neighbors=7,
                        n_components=target_dim,
                        metric='euclidean')
        data_d2v = fit.fit_transform(significant_vectors)
        fit = umap.UMAP(n_neighbors=7, n_components=2, metric='euclidean')
        vec_2d = fit.fit_transform(data_d2v)
    try:
        lvls_df['vector'] = [v for v in data_d2v]
    except ValueError:
        pdb.set_trace()
        print('Error')
    lvls_df['vector300'] = [v for v in significant_vectors]
    significant_terms_enriched = enrich_significant_terms(
        lvls_df, data_d2v, vec_2d, cluster_selection_method)

    topics, top_columns = display_topics(significant_terms_enriched,
                                         n_rows=25,
                                         n_cols=250)
    # topics,top_columns = display_topics(significant_terms_enriched,n_rows=25,n_cols=10)#testing with 10 topics
    print('{} topics'.format(significant_terms_enriched['topic'].max() + 1))
    print('\n')
    print(topics)
    topic_params = topic_params_object(topics, lvls_df, significant_vectors)
    return {'topics': topics, 'lvls_df': lvls_df, 'topic_params': topic_params}
Example #15
 def __init__(self, fname):
     self._vectors = pymagnitude.Magnitude(fname, lazy_loading=-1, blocking=True)
def create_input_embeddings(base_name,
                            keyword_size,
                            caption_json_path,
                            doodle_json_path,
                            w2v_magnitdue_path,
                            min_word_freq=5,
                            max_len=50):
    dataset_name = 'coco_' + base_name
    output_folder = os.path.join('data', base_name)
    captions_per_image = 5

    # Read Karpathy JSON
    with open(caption_json_path, 'r') as j:
        data = json.load(j)

    # Read doodle JSON
    with open(doodle_json_path, 'r') as j:
        doodle = json.load(j)

    # Read w2v
    w2v = pymagnitude.Magnitude(w2v_magnitdue_path)

    # Read image paths and captions for each image
    train_keywords = []
    train_image_captions = []
    val_keywords = []
    val_image_captions = []
    test_keywords = []
    test_image_captions = []
    word_freq = Counter()
    exclude_count = 0
    total_count = 0
    for img in data['images']:
        captions = []
        key_freq = Counter()
        for c in img['sentences']:
            # Update word frequency
            word_freq.update(c['tokens'])
            if len(c['tokens']) <= max_len:
                captions.append(c['tokens'])
                keys = [t for t in c['tokens'] if doodle.get(t) is not None]
                if len(keys) > 0:
                    key_freq.update(keys)

        if len(captions) == 0 or len(key_freq) == 0:
            print("----------> no key match: ", captions[-1])
            continue

        keywords = [nn for nn, c in key_freq.most_common(keyword_size)]
        total_count += 1
        if len(keywords) < keyword_size:
            # print(keywords)
            exclude_count += 1
            continue

        if img['split'] in {'train', 'restval'}:
            train_keywords.append(keywords)
            train_image_captions.append(captions)

        elif img['split'] in {'val'}:
            val_keywords.append(keywords)
            val_image_captions.append(captions)

        elif img['split'] in {'test'}:
            test_keywords.append(keywords)
            test_image_captions.append(captions)

    # Create word map
    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    word_map['<unk>'] = len(word_map) + 1
    word_map['<start>'] = len(word_map) + 1
    word_map['<end>'] = len(word_map) + 1
    word_map['<pad>'] = 0

    # Create a base/root name for all output files
    base_filename = dataset_name + '_' + str(
        keyword_size
    )  # + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # Save word map to a JSON
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    with open(
            os.path.join(output_folder, 'WORDMAP_' + base_filename + '.json'),
            'w') as j:
        json.dump(word_map, j)

    print("{} of {} will be excluded".format(exclude_count, total_count))

    # Sample captions for each image, save images to HDF5 file, and captions and their lengths to JSON files
    seed(123)
    for keys, imcaps, split in [(train_keywords, train_image_captions,
                                 'TRAIN'),
                                (val_keywords, val_image_captions, 'VAL'),
                                (test_keywords, test_image_captions, 'TEST')]:

        with h5py.File(
                os.path.join(output_folder,
                             split + '_IMAGES_' + base_filename + '.hdf5'),
                'a') as h:
            # Make a note of the number of captions we are sampling per image
            h.attrs['captions_per_image'] = captions_per_image
            print("\nReading %s images and captions, storing to file...\n" %
                  split)

            enc_keywords = []
            enc_captions = []
            caplens = []
            for i in range(len(imcaps)):

                # Sample captions
                if len(imcaps[i]) < captions_per_image:
                    captions = imcaps[i] + [
                        choice(imcaps[i])
                        for _ in range(captions_per_image - len(imcaps[i]))
                    ]
                else:
                    captions = sample(imcaps[i], k=captions_per_image)

                # Sanity check
                assert len(captions) == captions_per_image

                for j, c in enumerate(captions):
                    if keys[i] in c:
                        print("found", keys[i], c)

                    # Encode keywords
                    enc_k = [
                        word_map.get(key, word_map['<unk>']) for key in keys[i]
                    ]

                    # Encode captions
                    enc_c = [word_map['<start>']] + [
                        word_map.get(word, word_map['<unk>']) for word in c
                    ] + [word_map['<end>']
                         ] + [word_map['<pad>']] * (max_len - len(c))

                    # Find caption lengths
                    c_len = len(c) + 2

                    enc_keywords.append(enc_k)
                    enc_captions.append(enc_c)
                    caplens.append(c_len)

            # Sanity check
            assert len(enc_keywords) == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            with open(
                    os.path.join(
                        output_folder,
                        split + '_KEYWORDS_' + base_filename + '.json'),
                    'w') as j:
                json.dump(enc_keywords, j)

            with open(
                    os.path.join(
                        output_folder,
                        split + '_CAPTIONS_' + base_filename + '.json'),
                    'w') as j:
                json.dump(enc_captions, j)

            with open(
                    os.path.join(output_folder, split + '_CAPLENS_' +
                                 base_filename + '.json'), 'w') as j:
                json.dump(caplens, j)
Example #17
    # ppdb-xl
    # alpha = 1
    # beta = 1.0185
    # dev.out score: 44.9207

    # optparser.add_option("-r", action="context_word", dest="context_word", default=False)
    (opts, _) = optparser.parse_args()

    if opts.logfile is not None:
        logging.basicConfig(filename=opts.logfile,
                            filemode='w',
                            level=logging.DEBUG)

    retrain = False
    word_vector = load_wvecs(pymagnitude.Magnitude(opts.wordvecfile))
    new_retrofitted_magnitude = os.path.join(
        'data', 'glove.6B.100d.retrofit.magnitude')
    if retrain:
        new_retrofitted_txt = os.path.join('data',
                                           'glove.6B.100d.retrofit.txt')

        lexicon = load_lexicon(opts.lexiconfile)
        retrofitted_vector = retrofitting(word_vector, lexicon, opts.iteration,
                                          opts.alpha, opts.beta)
        # We need to do retrofitting here
        save_word_vecs(retrofitted_vector, new_retrofitted_txt)
        os.system("python3 -m pymagnitude.converter -i " +
                  new_retrofitted_txt + " -o " + new_retrofitted_magnitude)
    else:
        retrofitted_vector = load_wvecs(
            pymagnitude.Magnitude(new_retrofitted_magnitude))
def main(tmx_file, lang, percentile):
    """ Computes topic models by clustering dense word
        embeddings.
    """
    logger = logging.getLogger(__name__)
    logger.info('Compute topic model: {}, {}, {}'.format(
        tmx_file, lang, percentile))

    word_level_statistics = WordLevelStatistics(corpus_file=[text_fn[lang]],
                                                percentile_C=percentile)
    word_level_statistics.compute_spectra()

    lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
    significant_terms = word_level_statistics.significant_terms
    print('Threshold: {}, ({} percentile) find {} significant terms.'.format(
        word_level_statistics.threshold, word_level_statistics.percentile_C,
        len(significant_terms)))

    vectors = {}
    for language in ['en']:
        vectors[language] = pymagnitude.Magnitude(language_model[language])

    significant_vectors = vectors[lang].query(significant_terms)

    try:
        fit = umap.UMAP(n_neighbors=15,
                        n_components=target_dim,
                        metric='euclidean')
        data_d2v = fit.fit_transform(significant_vectors)
        fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
        vec_2d = fit.fit_transform(data_d2v)
    except Exception as ex:
        logging.error("Trying bhtsne. Got exception {}".format(ex))
        data_d2v = bhtsne.tsne(np.asfarray(significant_vectors,
                                           dtype='float64'),
                               dimensions=2)
        vec_2d = data_d2v

    lvls_df['vector'] = [v for v in data_d2v]

    significant_terms_enriched = enrich_significant_terms(
        lvls_df, data_d2v, vec_2d, cluster_selection_method)
    exemplar_scores, hovers = topic_exemplars(significant_terms_enriched)

    sents = [s['sentence'] for s in sentences]
    sent_ids = [s['sent_id'] for s in sentences]

    significant_terms_enriched['weight'] = significant_terms_enriched[
        'sigma_nor']

    msg_topics = message_topics(topic_model=significant_terms_enriched,
                                sentences=sents,
                                sentences_ids=sent_ids,
                                significant_terms=significant_terms)

    msg_topics_df = pd.DataFrame(msg_topics).fillna(0.0).T

    K = significant_terms_enriched['topic'].max() + 1
    topics, top_columns = display_topics(significant_terms_enriched,
                                         n_rows=25,
                                         n_cols=K)

    pwd = os.environ.get('PWD')
    fmt = '{}/models/{}_{}_{}.csv'
    significant_terms_file_name = fmt.format(pwd, 'significant_terms', lang,
                                             str(percentile))
    msg_topics_file_name = fmt.format(pwd, 'msg_topics', lang, str(percentile))
    data_filename_fmt = '{}/models/significant_vectors_{}_{}.npy'
    data_filename = data_filename_fmt.format(pwd, lang, percentile)

    significant_terms_enriched.to_csv(significant_terms_file_name,
                                      index=False,
                                      encoding='utf-8')
    msg_topics_df.to_csv(msg_topics_file_name, index=False, encoding='utf-8')
    np.save(data_filename, data_d2v)
def extension3_eng_fr(eng_mag, fr_mag, b_dict, t_dict, output_f):
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)

    data_dict_en_to_fr = {}
    data_dict_fr_to_en = {}
    with open(b_dict) as f:
        for line in f:
            pair = line.split(" ")
            pair[1] = pair[1][:-1]
            data_dict_en_to_fr[pair[0]] = pair[0]
            data_dict_fr_to_en[pair[1]] = pair[1]

    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(
            data_dict_fr_to_en[key])  #vector of english word
        fr = fr_vectors.query(key)  #vector of french word
        en_mat.append(en)
        fr_mat.append(fr)

    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)

    u, sig, vt = np.linalg.svd(np.matmul(fr_mat.transpose(), en_mat))

    W = np.matmul(np.transpose(vt), np.transpose(u))

    mapped = np.matmul(en_mat, W)

    mat_avg = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(
            data_dict_fr_to_en[key])  #vector of english word
        fr = fr_vectors.query(key)  #vector of french word

        average = (np.matmul(en, W) + fr) / 2
        mat_avg.append(average)

    mat_avg = np.array(mat_avg)

    uu, sigsig, vtvt = np.linalg.svd(np.matmul(mat_avg.transpose(), mapped))

    WW = np.matmul(np.transpose(vtvt), np.transpose(uu))

    final_transform = np.matmul(W, WW)

    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            print(i)
            line = line[:-1]
            pair = line.split(" ")
            line = pair[0] + " " + pair[1]
            topn = fr_vectors.most_similar(np.matmul(
                eng_vectors.query(pair[0]), final_transform),
                                           topn=5)
            for j in range(5):
                word = topn[j][0]
                line = line + " " + word
            final.append(line)

    np.savetxt(output_f, final, fmt="%s")
 def __init__(self, wvec_file, topn=10,lexicon=None):
     self.wvecs = pymagnitude.Magnitude(wvec_file)
     # self.wvecfile = wvec_file
     self.topn = topn
     self.lexicon = lexicon
Example #21
import numpy as np

from wiki import data

from wiki import utils

from musket_core.datasets import PredictionItem

import pymagnitude

m_path = "/Users/dreamflyer/Downloads/glove-lemmatized.6B.300d.magnitude"

vectors = pymagnitude.Magnitude(m_path)

none = vectors.query("none")


def convert_name(name):
    tokens = utils.stoa_1(name)

    result = []

    size = len(tokens)

    for i in range(size):
        result.append(' '.join(tokens[0:i + 1]))

    result.reverse()

    return result
Example #22
 def __init__(self, wvec_file, topn=10):
     self.wvecs = pymagnitude.Magnitude(wvec_file)
     self.topn = topn
import pickle
import numpy as np
import pymagnitude

# load input data
X = pickle.load(open('X.pkl', 'rb'))

# load the pretrained word2vec model for feature assignment
pretrained_magnitude = r'../../../Downloads/pretrained/glove.6B.300d.magnitude'
vectors = pymagnitude.Magnitude(pretrained_magnitude)


# setup speciality cleaning
def get_document_features(data_in):
    """Used to clean 80k Mechanical Turk responses.

    Params:
        data_in --  text segment to process
    Returns:
        average word-vector features for the body text and for the highlighted text
    """
    data_in = data_in.replace('<span class=\"active_text\">',
                              '').replace('</span>', '')
    body = data_in.split(r'\n                                    ')[1].replace(
        '\n', '')
    avg_vec = np.mean(vectors.query(body.split(' ')), axis=(0))

    high_text = data_in.split(
        r'\n                                    ')[0].replace('\n', '')
    high_avg_vec = np.mean(vectors.query(high_text.split(' ')), axis=(0))
    return avg_vec, high_avg_vec
Example #24
def sewing_space_en_fr(eng_mag, fr_mag, b_dict, t_dict, output_f, alpha=0.1):
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)

    data_dict_en_to_fr = {}
    data_dict_fr_to_en = {}
    with open(b_dict) as f:
        for line in f:
            pair = line.split(" ")
            pair[1] = pair[1][:-1]
            data_dict_en_to_fr[pair[0]] = pair[0]
            data_dict_fr_to_en[pair[1]] = pair[1]

    #print('i get here')

    en_d = {}
    fr_d = {}

    for key in data_dict_fr_to_en.keys():
        en_d[data_dict_fr_to_en[key]] = eng_vectors.query(
            data_dict_fr_to_en[key])
        fr_d[key] = fr_vectors.query(key)

    for key in data_dict_fr_to_en.keys():
        en = en_d[data_dict_fr_to_en[key]]  #vector of english word
        fr = fr_d[key]  #vector of french word

        diff_vec_fr = en - fr
        diff_vec_en = fr - en

        en = en + diff_vec_en * alpha
        fr = fr + diff_vec_fr * alpha

        en_d[data_dict_fr_to_en[key]] = en
        fr_d[key] = fr

    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = en_d[data_dict_fr_to_en[key]]
        fr = fr_d[key]

        en_mat.append(en)
        fr_mat.append(fr)

    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)

    u, sig, vt = np.linalg.svd(np.matmul(fr_mat.transpose(), en_mat))

    W = np.matmul(np.transpose(vt), np.transpose(u))

    final_transform = W  #np.matmul(W, WW)

    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            print(i)
            line = line[:-1]
            pair = line.split(" ")
            line = pair[0] + " " + pair[1]
            topn = fr_vectors.most_similar(np.matmul(
                eng_vectors.query(pair[0]), final_transform),
                                           topn=5)
            for j in range(5):
                word = topn[j][0]
                line = line + " " + word

            final.append(line)

    np.savetxt(output_f, final, fmt="%s")
Example #25
def load_embedding_model(path: str) -> pymagnitude.Magnitude:
    # github.com/plasticityai/magnitude
    logging.info(f"loading embedding model from:\n {path}")
    vectors = pymagnitude.Magnitude(path=path)
    return vectors
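
A short usage sketch of the returned Magnitude object, using only standard pymagnitude calls (the path is a placeholder):

vectors = load_embedding_model('vectors/wiki-news-300d-1M-subword.magnitude')
print(vectors.dim)                           # vector dimensionality
print('dog' in vectors)                      # membership test
print(vectors.query('dog')[:5])              # first components of the word's vector
print(vectors.most_similar('dog', topn=3))   # [(word, similarity), ...] pairs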