def __init__(
    self,
    corpus_fname,
    corpus_dir_path='.',
    encoding='utf-8',
    language='english',
    need_preprocessing=False,
    limit=None
):
    """
    Set up the EMILLE preprocessor and delegate to BasePreprocessor.

    :param corpus_fname: Corpus Filename to be preprocessed
    :param corpus_dir_path: Corpus Directory Path [ Default '.' ]
    :param encoding: Encoding format of the corpus [ Default utf-8 ]
    :param language: Language of the corpus, stored as ``self.language``
        and used as the key into ``self.lang_split_sent``
    :param need_preprocessing: Forwarded unchanged to the
        BasePreprocessor constructor
    :param limit: Forwarded unchanged to the BasePreprocessor constructor
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.Preprocessor')
    self.language = language
    # Sentence split regex per language. Any language without an explicit
    # entry falls back to the empty pattern u''.
    self.lang_split_sent = defaultdict(lambda: u'')
    self.lang_split_sent.update({'hindi': u'[।]'})
    self.logger.info('EmilleCorpusPreprocessor utilised')
    base_kwargs = {
        'corpus_dir_path': corpus_dir_path,
        'encoding': encoding,
        'need_preprocessing': need_preprocessing,
        'limit': limit,
    }
    super(EmilleCorpusPreprocessor, self).__init__(
        corpus_fname, **base_kwargs
    )
def __init__(self, model_1, model_2, bilingual_dict, encoding='utf-8'):
    """
    Constructor initialization for the vector space mapper.

    Extracts, for every entry of the bilingual dictionary, the vector of
    the source word from ``model_1`` and of the target word from
    ``model_2``, then keeps only the entries for which both vectors are
    available.

    :param model_1: Model for language 1 (source side of the dictionary)
    :param model_2: Model for language 2 (target side of the dictionary)
    :param bilingual_dict: Iterable of (word_1, word_2) pairs; it is
        stored as given and also converted with ``dict(...)`` for lookup
    :param encoding: Encoding stored as ``self.encoding``
    :raises ValueError: If no dictionary entry has a vector in both
        models (previously surfaced as an opaque unpacking error)
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.VectorSpaceMapper')
    self.model_1 = model_1
    self.model_2 = model_2
    self.encoding = encoding
    self.lt = None
    self.bilingual_dict = bilingual_dict
    # keys() and values() of the same unmodified dict iterate in
    # corresponding order, so position i on both sides is one entry.
    bilingual_dict = dict(bilingual_dict)
    self.logger.debug('Extracting vocabulary and vector list from model 1')
    vectors_1, words_1 = self._extract_vectors_and_words(
        self.model_1, bilingual_dict.keys())
    self.logger.debug('Extracting vocabulary and vector list from model 2')
    vectors_2, words_2 = self._extract_vectors_and_words(
        self.model_2, bilingual_dict.values())
    # Remove corresponding elements if any vectors were missing from models
    # across both languages.
    # NOTE(review): assumes the helper returns one slot per requested word
    # with None marking a missing vector - confirm against the helper.
    aligned = [
        (vec_1, word_1, vec_2, word_2)
        for vec_1, word_1, vec_2, word_2
        in zip(vectors_1, words_1, vectors_2, words_2)
        if vec_1 is not None and vec_2 is not None
    ]
    if not aligned:
        # zip(*[]) yields nothing, so the 4-way unpack below would fail
        # with a confusing message; fail early with the real cause.
        raise ValueError(
            'No bilingual dictionary entry has vectors in both models'
        )
    (self.vector_1_list, self.word_1_list,
     self.vector_2_list, self.word_2_list) = zip(*aligned)
def __init__(self, corpus_fname, corpus_dir_path='.', encoding='utf-8',
             need_preprocessing=False, language='english', limit=None):
    """
    Set up the Hc-Corpus preprocessor and delegate to BasePreprocessor.

    :param corpus_fname: Corpus Filename to be preprocessed
    :param corpus_dir_path: Corpus Directory Path [ Default '.' ]
    :param encoding: Encoding format of the corpus [ Default utf-8 ]
    :param need_preprocessing: Forwarded unchanged to the
        BasePreprocessor constructor
    :param language: Language of the corpus, stored as ``self.language``
        and used as the key into ``self.lang_split_sent``
    :param limit: Forwarded unchanged to the BasePreprocessor constructor
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.Preprocessor')
    self.language = language
    # Sentence split regex per language. Any language without an explicit
    # entry falls back to the empty pattern u''.
    split_patterns = defaultdict(lambda: u'')
    split_patterns['hindi'] = u'[।]'
    self.lang_split_sent = split_patterns
    self.logger.info('HcCorpusPreprocessor utilised')
    super(HcCorpusPreprocessor, self).__init__(
        corpus_fname,
        corpus_dir_path=corpus_dir_path,
        encoding=encoding,
        need_preprocessing=need_preprocessing,
        limit=limit
    )
def __init__(self, model_1, model_2, bilingual_dict, encoding='utf-8'):
    """
    Constructor initialization for the vector space mapper.

    For each bilingual dictionary entry the source word vector is taken
    from ``model_1`` and the target word vector from ``model_2``. Entries
    lacking a vector on either side are discarded so that the four result
    sequences stay aligned position by position.

    :param model_1: Model for language 1 (source side of the dictionary)
    :param model_2: Model for language 2 (target side of the dictionary)
    :param bilingual_dict: Iterable of (word_1, word_2) pairs; it is
        stored as given and also converted with ``dict(...)`` for lookup
    :param encoding: Encoding stored as ``self.encoding``
    :raises ValueError: If no dictionary entry has a vector in both
        models (previously surfaced as an opaque unpacking error)
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.VectorSpaceMapper')
    self.model_1 = model_1
    self.model_2 = model_2
    self.encoding = encoding
    self.lt = None
    self.bilingual_dict = bilingual_dict
    # keys() and values() of the same unmodified dict iterate in
    # corresponding order, so position i on both sides is one entry.
    bilingual_dict = dict(bilingual_dict)

    self.logger.debug('Extracting vocabulary and vector list from model 1')
    vectors_1, words_1 = self._extract_vectors_and_words(
        self.model_1, bilingual_dict.keys()
    )
    self.logger.debug('Extracting vocabulary and vector list from model 2')
    vectors_2, words_2 = self._extract_vectors_and_words(
        self.model_2, bilingual_dict.values()
    )

    # Remove corresponding elements if any vectors were missing from models
    # across both languages.
    # NOTE(review): assumes the helper returns one slot per requested word
    # with None marking a missing vector - confirm against the helper.
    aligned = []
    for row in zip(vectors_1, words_1, vectors_2, words_2):
        vec_1, _, vec_2, _ = row
        if vec_1 is None or vec_2 is None:
            continue
        aligned.append(row)

    if not aligned:
        # zip(*[]) yields nothing, so the 4-way unpack below would fail
        # with a confusing message; fail early with the real cause.
        raise ValueError(
            'No bilingual dictionary entry has vectors in both models'
        )

    (
        self.vector_1_list,
        self.word_1_list,
        self.vector_2_list,
        self.word_2_list
    ) = zip(*aligned)
def __init__(
    self,
    corpus_fname,
    corpus_dir_path='.',
    encoding='utf-8',
    need_preprocessing=False,
    limit=None
):
    """
    Constructor initialization for BasePreprocessor.

    When ``need_preprocessing`` is ``True`` an intermediate file named
    ``<corpus_fname>.preprocessed`` is created in ``corpus_dir_path``
    unless it already exists; otherwise the corpus file itself is used
    as the preprocessed corpus.

    :param corpus_fname: Corpus Filename to be preprocessed
    :param corpus_dir_path: Corpus Directory Path [ Default '.' ]
    :param encoding: Encoding used to read the corpus [ Default utf-8 ]
    :param need_preprocessing: Only the exact value ``True`` triggers the
        creation of the intermediate file
    :param limit: Stored on the instance as ``self.limit``
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.Preprocessor')
    self.limit = limit
    self.corpus_fname = corpus_fname
    self.corpus_path = os.path.join(corpus_dir_path, self.corpus_fname)
    self.encoding = encoding

    if need_preprocessing is not True:
        # The corpus is taken to be usable as is.
        self.logger.info(
            'Utilising Preprocessed Corpus: %s' % (self.corpus_fname)
        )
        self.preprocessed_corpus_fname = self.corpus_fname
    else:
        intermediate_fname = '%s.preprocessed' % corpus_fname
        # Set before extraction so it is available while the subclass
        # hooks below run.
        self.preprocessed_corpus_path = os.path.join(
            corpus_dir_path, intermediate_fname
        )
        if os.path.exists(self.preprocessed_corpus_path):
            self.logger.info(
                'Preprocessed Corpus found: %s.preprocessed', corpus_fname
            )
        else:
            with codecs.open(
                self.corpus_path, 'r', encoding=self.encoding
            ) as corpus_handle:
                self.logger.debug('Extracting Corpus Data')
                extracted = self._extract_corpus_data(
                    data=corpus_handle.read()
                )
                self._save_preprocessed_data(
                    data=extracted,
                    output_fpath=self.preprocessed_corpus_path
                )
                self.logger.debug('Saved Intermediate Preprocessed File')
        self.preprocessed_corpus_fname = intermediate_fname

    self.preprocessed_corpus_path = os.path.join(
        corpus_dir_path, self.preprocessed_corpus_fname
    )
def __init__(
    self,
    corpus_fname,
    corpus_dir_path='.',
    encoding='utf-8',
    need_preprocessing=False,
    language='english',
    limit=None
):
    """
    Set up the Leipzig preprocessor and delegate to BasePreprocessor.

    The Leipzig specific preprocessing is run when the file
    ``<corpus_fname>.preprocessed`` is absent from ``corpus_dir_path``.
    The BasePreprocessor constructor is then pointed at that file.

    :param corpus_fname: Corpus Filename to be preprocessed
    :param corpus_dir_path: Corpus Directory Path [ Default '.' ]
    :param encoding: Encoding format of the corpus [ Default utf-8 ]
    :param need_preprocessing: Accepted for signature compatibility; the
        base constructor is always called with ``need_preprocessing=False``
    :param language: Language of the corpus, stored as ``self.language``
        and used as the key into ``self.lang_split_sent``
    :param limit: Forwarded unchanged to the BasePreprocessor constructor
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.Preprocessor')
    self.language = language
    # Sentence split regex per language. Any language without an explicit
    # entry falls back to the empty pattern u''.
    self.lang_split_sent = defaultdict(lambda: u'')
    self.lang_split_sent['hindi'] = u'[।]'
    self.logger.info('LeipzigPreprocessor utilised')

    intermediate_fname = "%s.preprocessed" % corpus_fname
    intermediate_path = os.path.join(corpus_dir_path, intermediate_fname)
    if not os.path.exists(intermediate_path):
        # Build the intermediate file from the raw Leipzig corpus.
        self._leipzig_corpus_preprocess(
            corpus_fname, corpus_dir_path, encoding
        )

    # Hand the intermediate file to the BasePreprocessor constructor.
    super(LeipzigPreprocessor, self).__init__(
        corpus_fname=intermediate_fname,
        corpus_dir_path=corpus_dir_path,
        encoding=encoding,
        need_preprocessing=False,
        limit=limit
    )
def __init__(self, corpus_fname, corpus_dir_path='.', encoding='utf-8',
             need_preprocessing=False, limit=None):
    """
    Constructor initialization for BasePreprocessor.

    Resolves which file holds the preprocessed corpus. With
    ``need_preprocessing=True`` that is ``<corpus_fname>.preprocessed``
    inside ``corpus_dir_path``, generated here if it does not exist yet;
    in every other case it is the corpus file itself.

    :param corpus_fname: Corpus Filename to be preprocessed
    :param corpus_dir_path: Corpus Directory Path [ Default '.' ]
    :param encoding: Encoding used to read the corpus [ Default utf-8 ]
    :param need_preprocessing: Only the exact value ``True`` triggers the
        creation of the intermediate file
    :param limit: Stored on the instance as ``self.limit``
    """
    # Prefer the module level logger; build one if the module has none.
    try:
        self.logger = LOGGER
    except NameError:
        self.logger = log.initialise('T-Vecs.Preprocessor')
    self.limit = limit
    self.corpus_fname = corpus_fname
    self.corpus_path = os.path.join(corpus_dir_path, self.corpus_fname)
    self.encoding = encoding
    if need_preprocessing is True:
        target_fname = '%s.preprocessed' % corpus_fname
        # Set before extraction so it is available while the subclass
        # hooks below run.
        self.preprocessed_corpus_path = os.path.join(
            corpus_dir_path, target_fname)
        already_built = os.path.exists(self.preprocessed_corpus_path)
        if already_built:
            self.logger.info('Preprocessed Corpus found: %s.preprocessed',
                             corpus_fname)
        else:
            with codecs.open(self.corpus_path, 'r',
                             encoding=self.encoding) as raw_corpus:
                self.logger.debug('Extracting Corpus Data')
                cleaned = self._extract_corpus_data(data=raw_corpus.read())
                self._save_preprocessed_data(
                    data=cleaned,
                    output_fpath=self.preprocessed_corpus_path)
                self.logger.debug('Saved Intermediate Preprocessed File')
        self.preprocessed_corpus_fname = target_fname
    else:
        # The corpus is taken to be usable as is.
        self.logger.info('Utilising Preprocessed Corpus: %s' %
                         (self.corpus_fname))
        self.preprocessed_corpus_fname = self.corpus_fname
    self.preprocessed_corpus_path = os.path.join(
        corpus_dir_path, self.preprocessed_corpus_fname)
# -*- coding: utf-8 -*- """Module to map two Vector Spaces using a bilingual dictionary.""" import os import codecs import logging from gensim.models import Word2Vec import scipy.spatial.distance as dist from sklearn.linear_model import RidgeCV from sklearn import metrics from nltk.corpus import wordnet as wn from tvecs.bilingual_generator import bilingual_generator as bg from tvecs.logger import init_logger as log from itertools import chain from sklearn.linear_model import LinearRegression LOGGER = log.initialise('TVecs.VectorSpaceMapper') class VectorSpaceMapper(object): """ Class to map two vector spaces together. - Vector spaces obtained using the two Word2Vec models. - Bilingual Dict used to map semantic embeddings between vector spaces. - Linear Regression utilised for the mapping from :mod:`sklearn.linear_model` API Documentation: :param model_1: Model constructed from Language 1 built using :mod:`tvecs.model_generator.model_generator`. :param model_2: Model constructed from Language 2 built using
* Correlation Coefficient * P Value """ import os import csv import time import codecs from tvecs.evaluation import evaluation from tvecs.logger import init_logger as log from tvecs.model_generator import model_generator from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper LOGGER = log.initialise('TVecs.Multivariate') def evaluate(vsm, wordsim_dataset_path): """Extract Correlation, P-Value for specified vector space mapper.""" return evaluation.extract_correlation_coefficient( score_data_path=wordsim_dataset_path, vsm=vsm) def multivariate_analyse(): """Perform multivariate analysis.""" corpus_size = [54708929, 82063393, 109417858, 136772323] bilingual_size = [4516, 6774, 9032, 11291] dir_path = os.path.join('data', 'evaluate') wordsim_datasets = [('wordsim_relatedness_goldstandard.txt_translate', dir_path),
- Preprocessing Corpus - Implementation of BasePreprocessor module - HcCorpusPreprocessor - Word2Vec Model Building - Gensim Word2Vec SkipGram implementation """ import os import gensim from tvecs.logger import init_logger as log from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor from tvecs.preprocessor.emille_preprocessor import EmilleCorpusPreprocessor from tvecs.preprocessor.leipzig_preprocessor import LeipzigPreprocessor LOGGER = log.initialise('TVecs.ModelGeneration') def generate_model(preprocessor_type, language, corpus_fname, corpus_dir_path='.', output_fname=None, output_dir_path=os.path.join('data', 'models'), need_preprocessing=True, iterations=5): """ Function used to preprocess and generate models. API Documentation :param preprocessor_type: Class Name for preprocessor.
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """Module to map two Vector Spaces using a bilingual dictionary.""" import os import codecs import logging from gensim.models import Word2Vec import scipy.spatial.distance as dist from sklearn.linear_model import RidgeCV from sklearn import metrics from tvecs.bilingual_generator import bilingual_generator as bg from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.VectorSpaceMapper') class VectorSpaceMapper(object): """ Class to map two vector spaces together. - Vector spaces obtained using the two Word2Vec models. - Bilingual Dict used to map semantic embeddings between vector spaces. - Linear Regression utilised for the mapping from :mod:`sklearn.linear_model` API Documentation: :param model_1: Model constructed from Language 1 built using :mod:`tvecs.model_generator.model_generator`. :param model_2: Model constructed from Language 2 built using :mod:`tvecs.model_generator.model_generator`.
def args_parser():
    """
    Utilised for cmdline arguments parsing.

    Parses the command line, optionally overrides the values with those
    returned by ``parse_config`` when ``--config`` is given, and selects
    which pipeline stages have to run:

    - both models, both languages and a bilingual dictionary supplied:
      the models are loaded with ``Word2Vec.load`` and the first two
      entries of ``order_of_tvex_calls`` are skipped;
    - otherwise, ``corpus1`` and ``corpus2`` set: every stage runs. No
      command line option defines these two attributes, so they exist
      only after a config file was parsed; their absence raises the
      ``AttributeError`` that is handled below.

    If neither case applies an error is logged and the function returns.
    Otherwise ``evaluate`` is called, the mean square error is computed on
    the bilingual dictionary and, if present, on its test counterpart
    (same file name with ``train`` replaced by ``test``), the execution
    time is logged, and with ``--recommendations`` an interactive
    recommendation prompt is started.
    """
    global order_of_tvex_calls, order_of_evaluation
    parser = argparse.ArgumentParser(
        description='Script used to generate models'
    )
    parser.add_argument(
        "-v", "--verbose",
        help="increase output verbosity",
        action="store_true"
    )
    parser.add_argument(
        "-s", "--silent",
        help="silence all logging",
        action="store_true"
    )
    # type=int so a value given on the command line has the same type as
    # the default instead of arriving as a string.
    parser.add_argument(
        "-i", "--iter",
        help="number of Word2Vec iterations",
        default=5,
        type=int,
        action="store"
    )
    parser.add_argument(
        "-m1", "--model1",
        dest="model1",
        help="pre-computed model file path",
        action="store"
    )
    parser.add_argument(
        "-m2", "--model2",
        dest="model2",
        help="pre-computed model file path",
        action="store"
    )
    parser.add_argument(
        "-l1", "--language1",
        dest="language1",
        help="language name of model 1/ text 1",
        action="store"
    )
    parser.add_argument(
        "-l2", "--l2",
        dest="language2",
        help="language name of model 2/ text 2",
        action="store"
    )
    parser.add_argument(
        "-c", "--config",
        dest="config",
        help="config file path",
        action="store"
    )
    parser.add_argument(
        "-b", "--bilingual",
        dest="bilingual_dict",
        help="bilingual dictionary path",
        action="store"
    )
    parser.add_argument(
        "-r", "--recommendations",
        dest="recommendations",
        help="provide recommendations",
        action="store_true"
    )
    args = parser.parse_args()
    logger = log.initialise('TVecs')
    log.set_logger_normal(logger)
    parse_success = False
    try:
        # if Config is given higher priority, cmd line args are overriden
        if args.config:
            (
                args.language1, args.language2,
                args.model1, args.model2,
                args.corpus1, args.corpus2,
                args.iter, args.silent,
                args.verbose, args.bilingual_dict
            ) = parse_config(args.config)
        # Identity checks kept on purpose: after a config override the
        # types of these values depend on parse_config.
        if args.verbose is True:
            log.set_logger_verbose(logger)
        elif args.silent is True:
            log.set_logger_silent(logger)
        valid_model = args.model1 and args.model2
        valid_lang = args.language1 and args.language2
        # Load a precomputed model for trsl
        if valid_model and valid_lang and args.bilingual_dict:
            logger.info(
                'Loading Model of %s :%s', args.language1, args.model1
            )
            model_1 = Word2Vec.load(args.model1)
            logger.info(
                'Loading Model of %s :%s', args.language2, args.model2
            )
            model_2 = Word2Vec.load(args.model2)
            order_of_evaluation = order_of_tvex_calls[2:]
            tvex_calls['model_generator']['result'] = (model_1, model_2)
            parse_success = True
        # Build trsl using precomputed word sets and a config file
        elif args.corpus1 and args.corpus2:
            order_of_evaluation = order_of_tvex_calls[:]
            parse_success = True
    except AttributeError:
        # args.corpus1 / args.corpus2 do not exist without a config file.
        parse_success = False

    # Insufficient arguments passed to build trsl
    if parse_success is False:
        logger.error(
            "Required arguments not passed, run --help for more details"
        )
        return

    old_time = time.time()
    evaluate(logger, args)
    tvecs_vm = tvex_calls['vector_space_mapper']['result']
    logger.info('Evaluation of Training Dataset')
    tvecs_vm.obtain_mean_square_error_from_dataset(
        dataset_path=args.bilingual_dict
    )
    fpath, fname = ntpath.split(args.bilingual_dict)
    test_fname = fname.replace('train', 'test')
    test_path = os.path.join(fpath, test_fname)
    if os.path.exists(test_path):
        logger.info('Evaluation of Testing Dataset')
        # Evaluate the test file whose existence was just checked; the
        # training file was previously passed here by mistake.
        tvecs_vm.obtain_mean_square_error_from_dataset(
            dataset_path=test_path
        )
    new_time = time.time()
    loading_time = new_time - old_time
    logger.info("Execution Time: %s", loading_time)

    if args.recommendations is True:
        logger.info(
            "Recommendation Engine: %s => %s",
            args.language1, args.language2
        )
        while True:
            choice = raw_input(
                '\nEnter your Choice:\n1> Recommendation\n2> Exit\n\nChoice: '
            )
            try:
                wants_recommendation = int(choice) == 1
            except ValueError:
                # Non numeric input: ask again instead of crashing.
                logger.error('Invalid choice: %s', choice)
                continue
            if not wants_recommendation:
                break
            word = raw_input(
                "\nEnter word in Language %s: " % args.language1
            )
            tvecs_vm.get_recommendations_from_word(
                word, pretty_print=True
            )
# -*- coding: utf-8 -*- """ Utilise Yandex Translation Service. - Obtain bilingual semantic human score. """ import os import json import codecs import requests from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.Yandex') def get_valid_translation(word, from_to): """ Ensure the translation is valid. Return only single word translations. If multiple words translations, return None. API Documentation :param word: word to be translated :param from_to: language codes pair representing the src/target lang :type from_to: String :type word: String :return: translated word
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """Module to Evaluate T-Vecs model against Human Semantic Similarity Score.""" import os import codecs from scipy.stats import pearsonr from gensim.models import Word2Vec from tvecs.logger import init_logger as log from tvecs.bilingual_generator import bilingual_generator as bg from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper LOGGER = log.initialise('TVecs.Evaluation') def extract_correlation_coefficient(score_data_path, vsm): """ Extract Human Score, Word1, Word2. Compute T-Vecs Score. API Documentation :param score_data_path: File generated by preprocessor/yandex :param vsm: Vector spaces mapped using 2 models. :type score_data_path: :class:`String` :type vsm: :mod:`tvecs.vector_space_mapper.vector_space_mapper` :return: Returns (Correlation coefficient, P-Value) :rtype: :class:`Tuple(Float, Float)` """ LOGGER.info('Extracting Human Score from score data path: %s', score_data_path) with codecs.open(score_data_path, 'r', encoding='utf-8') as score_file: human_score, calculated_score = zip(*[[
* Correlation Coefficient * P Value """ import os import csv import time import codecs from tvecs.evaluation import evaluation from tvecs.logger import init_logger as log from tvecs.model_generator import model_generator from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper LOGGER = log.initialise('TVecs.Multivariate') def evaluate(vsm, wordsim_dataset_path): """Extract Correlation, P-Value for specified vector space mapper.""" return evaluation.extract_correlation_coefficient( score_data_path=wordsim_dataset_path, vsm=vsm ) def multivariate_analyse(): """Perform multivariate analysis.""" corpus_size = [54708929, 82063393, 109417858, 136772323] bilingual_size = [4516, 6774, 9032, 11291] dir_path = os.path.join(
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """ Utilise Yandex Translation Service. - Obtain bilingual semantic human score. """ import os import json import codecs import requests from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.Yandex') def get_valid_translation(word, from_to): """ Ensure the translation is valid. Return only single word translations. If multiple words translations, return None. API Documentation :param word: word to be translated :param from_to: language codes pair representing the src/target lang :type from_to: String :type word: String :return: translated word
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """EMILLE Corpus Preprocessor which inherits from BasePreprocessor.""" import regex as re from bs4 import BeautifulSoup from collections import defaultdict from nltk.tokenize import sent_tokenize from tvecs.preprocessor.base_preprocessor import BasePreprocessor from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.Preprocessor') class EmilleCorpusPreprocessor(BasePreprocessor): """ Emille Corpus Preprocessor which preprocesses the EMILLE Corpus. API Documentation: :param corpus_fname: Corpus Filename to be preprocessed :param corpus_dir_path: Corpus Directory Path [ Default Current Directory ] :param encoding: Encoding format of the corpus [ Default utf-8 ] :param language: Language of the model constructed [ Default English ] :param limit: Number of tokenized words to be limited to [ Default None ] :param need_preprocessing: Preprocess corpus to obtain only the valid content from the file to an intermediate file [ False - Corpus has each sentence in seperate lines ]
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """Module used to generate bilingual dictionary.""" import os import random import codecs from gensim.models import Word2Vec from tvecs.bilingual_generator import cluster as cl from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.BilingualDictionary') def load_bilingual_dictionary(bilingual_dictionary_path, encoding='utf-8'): """ Load bilingual dictionary from the specified bilingual_dictionary_path. API Documentation :param bilingual_dictionary_path: Path for Bilingual Dictionary. :param encoding: Encoding of the bilingual dictionary. :type bilingual_dictionary_path: :class:`String` :type encoding: :class:`String` :return: Bilingual Dictionary loaded. :rtype: :class:`List` """ LOGGER.info( 'Loading Bilingual Dictionary: %s', bilingual_dictionary_path )
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """HC Corpus Preprocessor which inherits from BasePreprocessor.""" import regex as re from collections import defaultdict from nltk.tokenize import sent_tokenize from tvecs.preprocessor.base_preprocessor import BasePreprocessor from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.Preprocessor') class HcCorpusPreprocessor(BasePreprocessor): """ Hc-Corpus Preprocessor which preprocesses the Hc-Corpus. API Documentation: :param corpus_fname: Corpus Filename to be preprocessed :param corpus_dir_path: Corpus Directory Path [ Default Current Directory ] :param encoding: Encoding format of the corpus [ Default utf-8 ] :param language: Language of the model constructed [ Default English ] :param limit: Number of tokenized words to be limited to [ Default None ] :param need_preprocessing: Preprocess corpus to obtain only the valid content from the file to an intermediate file [ False - Corpus has each sentence in seperate lines ] :type corpus_fname: :class:`String`
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """Module to Evaluate T-Vecs model against Human Semantic Similarity Score.""" import os import codecs from scipy.stats import pearsonr from gensim.models import Word2Vec from tvecs.logger import init_logger as log from tvecs.bilingual_generator import bilingual_generator as bg from tvecs.vector_space_mapper.vector_space_mapper import VectorSpaceMapper LOGGER = log.initialise('TVecs.Evaluation') def extract_correlation_coefficient(score_data_path, vsm): """ Extract Human Score, Word1, Word2. Compute T-Vecs Score. API Documentation :param score_data_path: File generated by preprocessor/yandex :param vsm: Vector spaces mapped using 2 models. :type score_data_path: :class:`String` :type vsm: :mod:`tvecs.vector_space_mapper.vector_space_mapper` :return: Returns (Correlation coefficient, P-Value) :rtype: :class:`Tuple(Float, Float)` """ LOGGER.info( 'Extracting Human Score from score data path: %s', score_data_path ) with codecs.open(score_data_path, 'r', encoding='utf-8') as score_file:
- Preprocessing Corpus - Implementation of BasePreprocessor module - HcCorpusPreprocessor - Word2Vec Model Building - Gensim Word2Vec SkipGram implementation """ import os import gensim from tvecs.logger import init_logger as log from tvecs.preprocessor.hccorpus_preprocessor import HcCorpusPreprocessor from tvecs.preprocessor.emille_preprocessor import EmilleCorpusPreprocessor from tvecs.preprocessor.leipzig_preprocessor import LeipzigPreprocessor LOGGER = log.initialise('TVecs.ModelGeneration') def generate_model( preprocessor_type, language, corpus_fname, corpus_dir_path='.', output_fname=None, output_dir_path=os.path.join('data', 'models'), need_preprocessing=True, iterations=5 ): """ Function used to preprocess and generate models.
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """Module used to generate bilingual dictionary.""" import re import os import random import codecs from gensim.models import Word2Vec from tvecs.bilingual_generator import cluster as cl from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.BilingualDictionary') def load_bilingual_dictionary(bilingual_dictionary_path, encoding='utf-8'): """ Load bilingual dictionary from the specified bilingual_dictionary_path. API Documentation :param bilingual_dictionary_path: Path for Bilingual Dictionary. :param encoding: Encoding of the bilingual dictionary. :type bilingual_dictionary_path: :class:`String` :type encoding: :class:`String` :return: Bilingual Dictionary loaded. :rtype: :class:`List` """ LOGGER.info( 'Loading Bilingual Dictionary: %s', bilingual_dictionary_path )
#!/usr/bin/env python2.7 # -*- coding: utf-8 -*- """Preprocess Evaluation Dataset by translating 1 column.""" import os import codecs from tvecs.logger import init_logger as log from tvecs.preprocessor import yandex_api as yp LOGGER = log.initialise('TVecs.Evaluation.Preprocess') def preprocess_dataset(dataset_path, delimiter='', encoding='utf-8'): """Preprocess Evaluation dataset by preprocessing 1 column.""" with codecs.open(dataset_path, 'r', encoding=encoding) as dataset_handle: LOGGER.info('Opening Evaluation Dataset for Preprocessing.') data = dataset_handle.read().split() output_data = [] for line in data: word_1, word_2, score = line.split(delimiter) t_word = yp.get_translation(word_2, 'en-hi').split() if len(t_word) > 0: t_word = t_word[0] processed_data = "\t".join([word_1, t_word, score]) LOGGER.debug( '%s : %s' % ('Preprocessed Evaluation Dataset', preprocess_dataset)) output_data.append(processed_data) with codecs.open('%s_%s' % (dataset_path, 'translate'), 'w', encoding=encoding) as output_handle:
"""Test.""" import os import json import codecs from sklearn.cluster import AffinityPropagation from tvecs.logger import init_logger as log LOGGER = log.initialise('TVecs.Clustering') def build_clusters(entire_word_list, model, damping_factor=0.5): """ Cluster word_list using Affinity Propagation. - Clustering based on the vectors from the Word2Vec model. API Documentation: :param entire_word_list: Word List provided to cluster. :param model: Model to obtain the vectors for the word_list. :param damping_factor: Damping factor for the affinity propagation. :type entire_word_list: :class:`List` :type model: :mod:`gensim.models.Word2Vec` :type damping_factor: :class:`Float` """ vocab = set(entire_word_list) vocab_dict = {} for word in vocab: try: vocab_dict[word] = model[word] except KeyError: