def save_vectors_file():
    data = load_data(FLAGS.data_path)

    vectorizer = Vectorizer()

    logging.info('getting vectors')
    img_vectors = []
    genders = []
    for img_path, gender_id in tqdm(data.items()):
        try:
            img_array = get_img(img_path)

            vector = vectorizer.get_vector(img_array)

            img_vectors.append(vector)
            genders.append(gender_id)
        except Exception as e:
            logging.warning('exception: {}'.format(e))

    vectorizer.close()

    dim_reduction_technique = get_dim_reduction_technique(
        FLAGS.dim_reduction_technique)

    reduced, model = dim_reduction_technique(img_vectors, FLAGS.n_dimensions)

    save_pkl_file(model, FLAGS.reducter_path)
    save_pkl_file((reduced, genders), FLAGS.vectors_path)
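The example above treats get_dim_reduction_technique as a lookup that returns a callable mapping (vectors, n_dimensions) to a (reduced, model) pair. A minimal sketch of one such callable, assuming a scikit-learn PCA backend; the function name and the choice of PCA are assumptions, not part of the original code.

# Hypothetical dimensionality-reduction callable matching the
# (reduced, model) contract that save_vectors_file() relies on.
from sklearn.decomposition import PCA

def pca_reduction(vectors, n_dimensions):
    model = PCA(n_components=n_dimensions)
    reduced = model.fit_transform(vectors)  # shape: (n_samples, n_dimensions)
    return reduced, model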
Example 2
    def classify(self):
        # Classifies unknown forum posts
        if not self.fit:
            print("Fitting must be performed before classifying")
            return

        vectorizer = Vectorizer(self.dictionary.dictionary)
        input_file = input(
            "Enter the name of the .txt file containing the unknown posts (including file-ending): "
        )
        try:
            with open(input_file, "r") as file:
                vectors = vectorizer.vectorize(
                    self.preprocessor.preprocess(file))
        except FileNotFoundError:
            if input("File not found. Press enter to try again or type 'm' and press enter to return to menu.").lower()\
                    == "m":
                return
            self.classify()
            return

        with open("result.txt", "w") as result_file:
            for line in self.classifier.classify(vectors):
                result_file.write((label_list[line] + "\n"))
        print(
            "Result saved in result.txt. " +
            "The predicted label of each post is printed on the corresponding line of the document."
        )
    def setUp(self):
        self.vec = Vectorizer(layer=-1, backend='gpu', cores=32)

        # Generate a list of images
        base_image = os.path.expanduser(
            '~') + '/SaturnServer/test_resources/map_image'
        self.imagenames = []
        for i in range(1, self.vec.cores + 1):
            self.imagenames.append("{}{}.jpg".format(base_image, i))
    def cosineScore(self, vector1, vector2):
        # calculate dot product
        dotProduct = self.getDotProduct(vector1, vector2)

        # get magnitudes
        magnitudes = Vectorizer.getMagnitude(vector1) * Vectorizer.getMagnitude(vector2)

        if magnitudes == 0:
            magnitudes = sys.float_info.epsilon  # use machine epsilon to avoid a divide-by-zero error
        # cosine distance = 1 - cosine similarity
        return 1 - (dotProduct / magnitudes)
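cosineScore returns a cosine distance (1 minus cosine similarity) and leans on a dot-product helper and a static getMagnitude. A minimal standalone sketch of those two pieces for plain numeric sequences; the snake_case names and bodies are assumptions, not the original helpers.

import math

def get_dot_product(v1, v2):
    # sum of element-wise products; assumes equal-length numeric vectors
    return sum(a * b for a, b in zip(v1, v2))

def get_magnitude(v):
    # Euclidean (L2) norm of the vector
    return math.sqrt(sum(x * x for x in v))

v1, v2 = [1.0, 0.0], [0.0, 1.0]
distance = 1 - get_dot_product(v1, v2) / (get_magnitude(v1) * get_magnitude(v2))
print(distance)  # 1.0 for orthogonal vectors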
class TopicEmbeddingModel():
    '''
    Wrapper class for different topic models
    
    '''
    def __init__(self,folder='model',modeltype='kpca',topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder,steps=['hashing','tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf',gamma=1.,n_components=topics)
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)

    def fit(self,X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''

        # transform list of strings into sparse BoW matrix
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].fit_transform(\
        #    self.bow['count_vectorizer'].fit_transform(X))

        # depending on the model, train
        if self.modeltype == 'kpca':
            Xc = self.model.fit_transform(X)
        elif self.modeltype == 'nmf':
            Xc = self.model.fit_transform(X)


    def predict(self,X):
        '''
        predicts cluster assignment from list of strings
        
        INPUT
        X   list of strings
        '''
        if not isinstance(X, list): X = [X]
        X = self.bow.transform(X)
        #X = self.bow['tfidf_transformer'].transform(\
        #    self.bow['count_vectorizer'].transform(X))
        
        if self.modeltype == 'kpca':
            return self.model.transform(X)
        elif self.modeltype == 'nmf':
            return self.model.transform(X)
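A hypothetical usage of TopicEmbeddingModel as defined above; the corpus strings are placeholders, and the 'model' folder is whatever the project's Vectorizer expects.

corpus = [
    "the cat sat on the mat",
    "stocks fell sharply after the earnings report",
    "the dog chased the cat across the yard",
]
topic_model = TopicEmbeddingModel(folder='model', modeltype='kpca', topics=2)
topic_model.fit(corpus)
print(topic_model.predict("a cat and a dog"))  # one row of topic coordinates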
    def __init__(self):

        # vectorizer class
        # based on composition instead of inheritance principles
        self.vectorizer = Vectorizer()

        # weights learned and used by model
        self.weights = np.array([])
        self.tag_enums = []

        self.tag_dict = {}
    def __init__(self,folder='model',modeltype='kpca',topics=10):
        # the classifier, which also contains the trained BoW transformer
        self.bow = Vectorizer(folder=folder,steps=['hashing','tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics

        if self.modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf',gamma=1.,n_components=topics)
        elif self.modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)
Example 8
def _get_token_similarity(query_string, pred_string):
    query_tokens = Tokenizer.tokenize(query_string)
    pred_tokens = Tokenizer.tokenize(pred_string)
    pred_vec = dict(zip(pred_tokens, Vectorizer.vectorize_tokens(pred_tokens)))
    query_vec = dict(
        zip(query_tokens, Vectorizer.vectorize_tokens(query_tokens)))
    ret = {}
    for k, v in query_vec.items():
        dist = cdist([v],
                     np.stack(list(pred_vec.values()), axis=0),
                     metric="cosine")[0]
        idx = dist.argsort()[:2]
        ret.update({k: list(np.asarray(list(pred_vec.keys()))[idx])})
    return ret
Example 9
def main(filename, category_filename, answer_col, predictor_col, hidden_nodes):
    df = pd.read_csv(filename, usecols=[answer_col, predictor_col])
    categories = pd.read_csv(category_filename,
                             usecols=[predictor_col])[predictor_col].values
    vectorizer = Vectorizer(df, categories, predictor_col, answer_col)
    vectorizer.format(0.6, 0.2)

    batch_size = 1000
    epocs = 50
    learning_rate = 1e-3
    model = build_and_train(vectorizer, batch_size, epocs, learning_rate,
                            hidden_nodes)
    validate(model, vectorizer)
    joblib.dump(model, filename + '.joblib')
Example 10
def main():
    img = get_img(FLAGS.img_path)

    vectorizer = Vectorizer()
    vector = vectorizer.get_vector(img)
    vectorizer.close()

    reducter = load_pkl_file(FLAGS.reducter_path)
    reduced = reducter.transform([vector])

    model = load_pkl_file(FLAGS.model_path)

    output = model.predict(reduced)[0]

    print('result: {}'.format(output))
Example 11
    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None
Example 12
def startAnalysis(folder, S1_path, S2_path):

    fetcher = PageFetcher()
    S1 = fetcher.fetchPages(folder, S1_path)
    S2 = fetcher.fetchPages(folder, S2_path)

    # We use a document representation based on the TF-IDF model
    TF_IDF = Vectorizer()
    S1_HTML = TF_IDF.fit_transform(S1)
    S2_HTML = TF_IDF.fit_transform(S2)
    pageAllignament = PageAllignament()
    S1S2_Pairs = pageAllignament.allignSources(S1_HTML, S2_HTML)

    print("Stats of: " + str(S1_path) + " and " + str(S2_path))
    evaluation_pipeline(S1S2_Pairs)
Example 13
def get_most_similar_title(query_title, df, top_n=5):
    logger.info(f"Query: \t\t {query_title}")
    v0, tokens = Vectorizer.vectorize_sent(query_title, get_tokens=True)
    logger.info(f"Processed Query: {' '.join(tokens)}\n")
    dist = cdist([v0], np.stack(df.title_vect.values, axis=0),
                 metric='cosine')[0]
    idx = dist.argsort()[:top_n]
    values = df[[
        "title", "abstract", "publish_time", "authors", "journal", "source_x",
        "url"
    ]].loc[idx].to_dict("records")
    ret = dict({
        "query": query_title,
        "processed_query": ' '.join(tokens),
        "pred": {}
    })
    for n, i, each in zip(range(1, top_n + 1), idx, values):
        tok_sim = _get_token_similarity(" ".join(tokens), each["title"])
        ret["pred"].update({
            n: {
                "score": round((1.0 - dist[i]), 5),
                "title": each["title"],
                "abstract": each["abstract"],
                "publish_time": each["publish_time"],
                "authors": each["authors"],
                "journal": each["journal"],
                "source_x": each["source_x"],
                "url": each["url"],
                #                 "token_similarity":tok_sim
            }
        })
    return ret
Example 14
def create_tf_idf(file_path):
    reader = TrainingTextReader(file_path)
    keywords = KeywordExtractor(reader.articles[10], 'useless.txt')
    vector_index = Vectorizer(keywords.article_sents_tokened)
    freq_mat = vector_index.frequencyMatrix
    normalized_vector = VectorNormalizer(freq_mat)
    norm_mat = normalized_vector.l2_norm_matrice
    tf_idf = InverseDocumentFrequency(norm_mat)
    return tf_idf.tf_idf_matrice
Example 15
    def test_regression__vectorizer_layer_minus_one_behaves_same(self):
        # GIVEN a layer to test
        layer_under_test = -1

        # AND a vectorizer that uses that layer
        vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path, backend='cpu')

        # AND an expected output
        expected_output = [0.0016, 0.9883, 0.0099, 0.00]

        #
        # WHEN extracting the attributes from an image
        print 'This test has not stalled, it takes 20-40 seconds on a fast-ish computer (%s)' % strftime("%H:%M:%S", gmtime())
        actual_output = roundArray(vec.get_attribute_vector(image_loc))

        #
        # THEN the output is as expected
        self.assertEqual(expected_output, actual_output, 'The output %s, does not match the expected output of %s' % (str(actual_output), str(expected_output)))
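This test (and Example 23 below) assumes a roundArray helper. A minimal sketch under the assumption that it simply rounds every element to four decimal places, matching the precision of expected_output above.

# Hypothetical roundArray helper; four decimals is an assumption
# inferred from the expected_output values.
def roundArray(values, ndigits=4):
    return [round(float(v), ndigits) for v in values]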
Example 16
def main():
    with timer("model loading"):
        # Load the model and pipeline
        model = ModelMLP()
        model.load_model()
        vectorizer = Vectorizer()
        vectorizer.load_vectorizer()

    with timer("data loading"):
        # Load the data to be predicted
        df = load_data_from_gcs()

    with timer("preprocess"):
        df = preprocess(df)

    with timer("predict"):
        X = df.drop(columns="price")
        X = vectorizer.transform(X)
        pred = model.predict(X)

        print(pred[:10])
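This example and Example 20 further down wrap each stage in a timer(...) context manager. A minimal sketch of such a helper, assuming it only needs to label a block and report its elapsed time (the real one may log instead of print).

import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # label a block of work and report how long it took
    start = time.time()
    yield
    print("[{}] done in {:.1f} s".format(name, time.time() - start))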
Example 17
    def __init__(self, folder='model', train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model is stored
        train   set True if you want to train 

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(self.folder + '/classifier.pickle')) or train:
            print 'Training classifier'
            self.train()
        print 'Loading classifier'
        clfdict = cPickle.load(open(self.folder + '/classifier.pickle'))
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']
Example 18
    def _load_data(self, data_dir, word_tokens, pristine_input,
                   pristine_output, batch_size, seq_length, seq_step):
        try:
            with open(os.path.join(data_dir, 'input.txt'),
                      encoding='utf-8') as input_file:
                text = input_file.read()
        except FileNotFoundError:
            print_red("No input.txt in data_dir")
            sys.exit(1)

        skip_validate = True
        try:
            with open(os.path.join(data_dir, 'validate.txt')) as validate_file:
                text_val = validate_file.read()
                skip_validate = False
        except FileNotFoundError:
            pass  # Validation text optional

        # Find some good default seed string in our source text.
        self.seeds = find_random_seeds(text)
        # Include our validation texts with our vectorizer
        all_text = text if skip_validate else '\n'.join([text, text_val])
        self.vectorizer = Vectorizer(all_text, word_tokens, pristine_input,
                                     pristine_output)

        data = self.vectorizer.vectorize(text)
        x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
        print('x.shape:', x.shape)
        print('y.shape:', y.shape)

        if skip_validate:
            return x, y, None, None

        data_val = self.vectorizer.vectorize(text_val)
        x_val, y_val = shape_for_stateful_rnn(data_val, batch_size, seq_length,
                                              seq_step)
        print('x_val.shape:', x_val.shape)
        print('y_val.shape:', y_val.shape)
        return x, y, x_val, y_val
Example 19
    def start(self):
        bag_of_words, words = TermFrequency(self.trained).create_vocabulary()

        v = Vectorizer(self.trained, self.classify, words, bag_of_words)

        tfidf_trained = v.tfidf_for_tweets_trained
        evaluations = v.evaluations
        tfidf_to_classify = v.tfidf_for_tweets_to_classify

        models = Models(tfidf_trained, evaluations, tfidf_to_classify)
        prediction = models.svm_linear()

        return prediction
Example 20
def main():
    # Load the training data
    with timer("train data load"):
        df = load_data_from_gcs()

    # Preprocessing
    with timer("preprocess"):
        df = preprocess(df)
        vectorizer = Vectorizer()

    X_train = df.drop(columns="price")
    y_train = df["price"]

    with timer("training"):
        X_train = vectorizer.fit_transform(X_train)

        # Training
        base_params = {
            'input_dropout': 0.2,
            'hidden_layers': 3,
            'hidden_units': 256,
            'hidden_activation': 'relu',
            'hidden_dropout': 0.2,
            'batch_norm': 'before_act',
            'optimizer': {
                'type': 'adam',
                'lr': 5e-5
            },
            'batch_size': 64,
        }

        model = ModelMLP(base_params)
        model.fit(X_train, y_train)

    with timer("save model"):
        # Save the model and pipeline
        vectorizer.save_vectorizer()
        model.save_model()
Example 21
    def preprocess_and_fit(self):
        # Method that preprocesses data, indexes all words, vectorizes posts and finally trains and tests the classifier
        processed = []
        processed_test = []
        for category in self.categories:
            processed.append(
                self.preprocessor.preprocess('training' + str(category) +
                                             ".txt"))
            processed_test.append(
                self.preprocessor.preprocess('testing' + str(category) +
                                             ".txt"))

        # Word indexing
        for category in processed:  # indexes all words into dictionary
            self.dictionary.index_words(category)
        print("Words indexed. Dictionary size: ",
              len(self.dictionary.dictionary), " words")

        # Vectorization
        vectorizer = Vectorizer(
            self.dictionary.dictionary
        )  # initializes vectorizer-object with dictionary
        vector_start = time.time()
        print("Vectorizing...")
        training_vectors = []
        testing_vectors = []
        for category in processed:
            training_vectors.append(vectorizer.vectorize(category))
        for category in processed_test:
            testing_vectors.append(vectorizer.vectorize(category))
        vector_time = time.time() - vector_start
        print("Vectorization completed in ", ("%.2f" % vector_time), "seconds")

        # Training and evaluation
        self.classifier.train(training_vectors)
        self.fit = True
        self.classifier.evaluate(testing_vectors)
Example 22
    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None
Example 23
    def test_regression__vectorizer_layer_minus_four_behaves_same(self):
        # GIVEN a layer to test
        layer_under_test = -4

        # AND a vectorizer that uses that layer
        vec = Vectorizer(layer=layer_under_test, prm_path=default_prm_path, backend='cpu')

        # AND an expected output stored in a file
        expected_output_file_path = os.path.expanduser('~')+'/SaturnServer/test_resources/layer4results.txt'

        #
        # WHEN extracting the attributes from an image
        print 'This test has not stalled, it takes 20-40 seconds on a fast-ish computer (%s)' % strftime("%H:%M:%S", gmtime())
        actual_output = roundArray(vec.get_attribute_vector(image_loc))

        #
        # THEN each element of the actual output array must match each element of the expected results
        with open(expected_output_file_path, 'r') as expected_output_file:
            element_no = 0
            for expected_element in expected_output_file:
                self.assertEqual(float(expected_element), actual_output[element_no],
                                 'The output (element %d) %s, does not match the expected output of %s'
                                 % (element_no, str(actual_output[element_no]), str(expected_element)))
                element_no += 1
Example 24
def calculate_cooccurrence(config):
    with open(config.input_filepath, "rb") as f:
        corpus = pickle.load(f)
    vectorizer = Vectorizer.from_corpus(
        corpus=corpus,
        vocab_size=config.vocab_size
    )
    cooccurrence = CooccurrenceEntries.setup(
        corpus=corpus,
        vectorizer=vectorizer
    )
    cooccurrence.build(
        window_size=config.window_size,
        num_partitions=config.num_partitions,
        chunk_size=config.chunk_size,
        output_directory=config.cooccurrence_dir
    ) 
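calculate_cooccurrence only reads attributes off its config argument. A hypothetical config object listing the fields the function touches; the default values are illustrative placeholders only.

from dataclasses import dataclass

@dataclass
class CooccurrenceConfig:
    # field names mirror the attributes read by calculate_cooccurrence
    input_filepath: str = "corpus.pkl"
    vocab_size: int = 20000
    window_size: int = 5
    num_partitions: int = 4
    chunk_size: int = 100000
    cooccurrence_dir: str = "cooccurrence/"

# calculate_cooccurrence(CooccurrenceConfig())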
    def __init__(self,folder='model',train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model is stored
        train   set True if you want to train 

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(self.folder+'/classifier.pickle')) or train:
            print 'Training classifier'
            self.train()
        print 'Loading classifier'
        clfdict = cPickle.load(open(self.folder+'/classifier.pickle'))
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']
Example 26
def vectorize_jobs(df_jobs, vectorizer_path, tfidfs_path, debug=False):
    # initializing the TF-IDF vectorizer
    if debug:
        print('[Job Vectorization 2/5] Initializing Vectorizer \n')
    vectorizer = Vectorizer()

    if debug:
        print('[Job Vectorization 3/5] Transforming/Vectorizing data \n')
    tfidf_jobs = vectorizer.fit_transform(
        (df_jobs['text']))  #fitting and transforming the vector

    if debug:
        print('[Job Vectorization 4/5] saving vectorizer to {path} \n'.format(
            path=vectorizer_path))
    vectorizer.save_vectorizer(vectorizer_path)

    if debug:
        print('[Job Vectorization 5/5] saving tfidf to {path} \n'.format(
            path=tfidfs_path))
    vectorizer.save_tfidfs(tfidf_jobs, tfidfs_path)
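A hypothetical call to vectorize_jobs: all it assumes is a DataFrame with a 'text' column plus two output paths, which are placeholders below.

import pandas as pd

df_jobs = pd.DataFrame({"text": [
    "senior python developer with nlp experience",
    "data engineer familiar with spark and airflow",
]})
vectorize_jobs(df_jobs, "job_vectorizer.pkl", "job_tfidfs.pkl", debug=True)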
Example 27
    ix_to_rel = {i: r for i, r in enumerate(rel_set)}
    num_words = len(word_set)
    num_tags = len(tag_set)
    num_rels = len(rel_set)

    ROOT_TAG = "root"

    WORD_SIZE = 100
    TAG_SIZE = 30
    HIDDEN_SIZE = 100
    NUM_EPOCHS = 3

    word_vectorizer = Vectorizer(WordExtractor(sents),
                                 None,
                                 "parser_word",
                                 WORD_SIZE,
                                 filler=ZeroFiller(WORD_SIZE),
                                 ce_enabled=False,
                                 tf_enabled=False)
    tag_vectorizer = Vectorizer(TagExtractor(sents),
                                None,
                                "parser_pos",
                                TAG_SIZE,
                                filler=ZeroFiller(TAG_SIZE),
                                ce_enabled=False,
                                tf_enabled=False)

    parser = SyntaxParser(num_words, WORD_SIZE, num_tags, TAG_SIZE,
                          WORD_SIZE + TAG_SIZE, HIDDEN_SIZE, num_rels)
    optimizer = optim.SGD(parser.parameters(), lr=0.1)
    loss_function = nn.NLLLoss()
	sorted_article_list = []
	for article, score in relevance_sorted_articles:
		print "Id: ", article.id_num
		print "Link: ", article.link
		print "Description: ", article.description
		print article.title ,":",score
		print '\n\n\n'
		sorted_article_list.append(article)
	return sorted_article_list



#==================FINDING TRENDING ARTICLES=================
trending_articles = findTrending(PICKELED_RECENT_ARTICLES_ALL_TOPICS)

vectorizer = Vectorizer()
vectorized_trending_articles = vectorizer.vectorize(trending_articles)

setArticleVectors(trending_articles, vectorized_trending_articles)

# for article in trending_articles:
# 	print article.description, article.vector

dimensions = str(len(vectorized_trending_articles)) + " x " + str(len(vectorized_trending_articles[0]))
print "Term document matrix with" +  dimensions + ": \n", vectorized_trending_articles

#==================KMEANS STARTS HERE=======================
# print "Calculating kmeans..."

# kmeans_calculator = KMeansClusterer()
Example 29
PREPROCESSOR = Preprocessor(thesaurus_path)  # no substitution is performed unless a thesaurus path is passed
print('Running preprocessing')
PREPROCESSOR.load_text([text_path])
whitelist = PREPROCESSOR.investigate_whitelist(thesaurus_path)
print('Saving')
PREPROCESSOR.save(auto_text_path)
PARSER = Parser()
print('Running dependency parsing..')
PARSER.t2f([auto_text_path + '/' + root + '.text'],
           kytea_model=kytea_path,
           eda_model=eda_path)
print('Saving results')
PARSER.save(tree_path)  # save the dependency-parsed output to a file
print("Loading the index...")
VECTORIZER = Vectorizer(index_path, t=1, list=whitelist)  # load the index
print('Loading trees')
vectors = VECTORIZER.get_vector([tree_path + '/' + root + '.eda'],
                                filter=3)  # generate the vectors
print(vectors)
print('Saving vectors')
VECTORIZER.save(vectors, [vector_path])  # save the vectors

#-----
# Compare the TF-IDF corpus vectors we already have with the query vector tfidf_vectors
#----

print('Loading TFIDF corpus vectors')
tfidf_corpus_vectors = VECTORIZER.load(
    sorted(glob.glob(tfidf_DB_path + '/*.vector')))
print(tfidf_corpus_vectors)
Example 30
print('Running dependency parsing..')
PARSER.t2f(sorted(glob.glob(auto_text_path + '/*')),
           kytea_model=kytea_path,
           eda_model=eda_path)  # dependency-parse the files in text_path
print('Saving results')
PARSER.save(tree_path)  # save the dependency-parsed output to a file
INDEX = Index(unigram=1, dep_trigram=1, bigram=1,
              dep_bigram=1)  # build the Index by reading unigram and bigram features out of the trees
print('Loading trees')
INDEX.add_index(sorted(glob.glob(tree_path +
                                 '/*')))  # build the index from the files under tree_path
print('Saving the INDEX...')
INDEX.save(index_path)  # save the index to index_path
print(index_path)
print("Loading the index...")
VECTORIZER = Vectorizer(index_path, t=1, list=whitelist)  # load the index  # threshold is 1
print('Loading trees')
vectors = VECTORIZER.get_vector(sorted(glob.glob(tree_path + '/*')),
                                filter=3)  # generate the vectors
print(vectors)
print('Saving vectors')
filename_list = sorted(glob.glob(tree_path + '/*'))
vector_path_list = []
for filename in filename_list:
    base_name = os.path.basename(filename)  # A.text
    root = os.path.splitext(base_name)[0]  # A
    file_name = vector_folder_path + '/' + root + '.vector'
    vector_path_list.append(file_name)
VECTORIZER.save(vectors, vector_path_list)  # save the vectors
print(vector_path_list)
Example 31
    def __init__(self, name, sents, vectorizer_words, vectorizer_forms, embedding_size,
                 tag_sents, tag_embedding_size, context_size,
                 lrs=(0.1, 0.1, 0.1), lr_decrease_factor=0.5, epochs_per_decrease=10):
        ######################################################################
        # Model's parameters.
        # 'sents' is a list of sentences of tuples ((form, word, tag), rel, head)
        self.name = name
        self.sents = sents
        self.embedding_size = embedding_size
        self.context_size = context_size

        ######################################################################
        # Load or create indices.
        # Common
        self.path_base = "internal"
        self.num_words = 0
        self.root_tag = "root"

        # CUDA flag
        self.is_cuda_available = torch.cuda.is_available()

        # For POS tags:
        self.tags = set()
        self.num_tags = 0
        self.tag2index = {}
        self.index2tag = {}

        # For chunk tags:
        self.chunks = set()
        self.num_chunks = 0
        self.chunk2index = {}
        self.index2chunk = {}

        # For relation tags:
        self.rels = set()
        self.num_rels = 0
        self.rel2index = {}
        self.index2rel = {}

        # Update database
        self.create_or_load_indices()
        if self.num_words == 0:
            self.num_words = self.get_num_words(self.sents)

        ######################################################################
        # Logic.
        # Learning rate controls
        self.lrs = lrs
        self.lr_decrease_factor = lr_decrease_factor
        self.epochs_per_decrease = epochs_per_decrease

        # Define machines
        self.vectorizer = Vectorizer(vectorizer_words, vectorizer_forms, name,
                                     embedding_size, filler=ZeroFiller(embedding_size),
                                     ce_enabled=True)

        # self.vectorizer = FastTextVectorizer(name, embedding_size * 2, "ft_sg_syntagrus.bin")

        self.tag_vectorizer = Vectorizer(tag_sents, None, name + "_pos",
                                         tag_embedding_size, filler=ZeroFiller(tag_embedding_size),
                                         ce_enabled=False, tf_enabled=False)

        # Tags embeddings (H).
        # Chunker will get linear combination as an input:
        #    I = H^T * p
        #    p - probabilities vector
        self.tag_embeddings = []
        for i in range(self.num_tags):
            tag = self.index2tag[i].lower()
            self.tag_embeddings.append(self.tag_vectorizer(tag, tag))
        self.tag_embeddings = torch.stack(self.tag_embeddings)
        if self.is_cuda_available:
            self.tag_embeddings = self.tag_embeddings.cuda()

        # Vector size is 1 (TF) + 100 (Word embedding) + 100 (Char grams embedding)
        self.vector_size = self.vectorizer.get_vector_size()

        self.tag_size = self.tag_vectorizer.get_vector_size()

        # Chunk size.
        # Benchmark is 200 (POS hidden) + 201 (embedding) + NUM_TAGS (probabilities)
        self.chunk_size = 2 * embedding_size + self.vector_size + self.tag_size

        # Parse size -- input size for parser.
        # When chunking is not available, parse size is equal to chunk size
        self.parse_size = self.chunk_size

        self.log("tagger input size: {}".format(self.vector_size))
        self.log("chunker input size: {}".format(self.chunk_size))
        self.log("parser input size: {}".format(self.parse_size))

        self.tagger = Tagger(self.vector_size, self.num_tags, "GRU", embedding_size)
        # self.chunker = Tagger(self.chunk_size, self.num_chunks, "LSTM", embedding_size)
        self.parser = SyntaxParser(0, 0, 0, 0, self.parse_size, embedding_size, self.num_rels)

        self.is_tagger_trained = False
        # self.is_chunker_trained = False
        self.is_parser_trained = False

        self.tagger_name = "pos tagging"
        # self.chunker_name = "chunking"
        self.parser_name = "parsing"

        # Try to load from file
        self.tagger_path = "{}/model_pos_{}.pt".format(self.path_base, self.name)
        # self.chunker_path = "{}/model_chunk_{}.pt".format(self.path_base, self.name)
        self.parser_path = "{}/model_parse_{}.pt".format(self.path_base, self.name)

        if os.path.exists(self.tagger_path):
            self.log("Loading POS tagger")
            self.tagger = torch.load(self.tagger_path)
            self.tagger.unit.flatten_parameters()
            self.is_tagger_trained = True
            self.log("Done")

        # if os.path.exists(self.chunker_path):
        #     self.log("Loading chunker")
        #     self.chunker = torch.load(self.chunker_path)
        #     self.chunker.unit.flatten_parameters()
        #     self.is_chunker_trained = True
        #     self.log("Done")

        if os.path.exists(self.parser_path):
            self.log("Loading parser")
            self.parser = torch.load(self.parser_path)
            self.parser.unit.flatten_parameters()
            self.is_parser_trained = True
            self.log("Done")
Example 32
class Model:
    def __init__(self, name, sents, vectorizer_words, vectorizer_forms, embedding_size,
                 tag_sents, tag_embedding_size, context_size,
                 lrs=(0.1, 0.1, 0.1), lr_decrease_factor=0.5, epochs_per_decrease=10):
        ######################################################################
        # Model's parameters.
        # 'sents' is a list of sentences of tuples ((form, word, tag), rel, head)
        self.name = name
        self.sents = sents
        self.embedding_size = embedding_size
        self.context_size = context_size

        ######################################################################
        # Load or create indices.
        # Common
        self.path_base = "internal"
        self.num_words = 0
        self.root_tag = "root"

        # CUDA flag
        self.is_cuda_available = torch.cuda.is_available()

        # For POS tags:
        self.tags = set()
        self.num_tags = 0
        self.tag2index = {}
        self.index2tag = {}

        # For chunk tags:
        self.chunks = set()
        self.num_chunks = 0
        self.chunk2index = {}
        self.index2chunk = {}

        # For relation tags:
        self.rels = set()
        self.num_rels = 0
        self.rel2index = {}
        self.index2rel = {}

        # Update database
        self.create_or_load_indices()
        if self.num_words == 0:
            self.num_words = self.get_num_words(self.sents)

        ######################################################################
        # Logic.
        # Learning rate controls
        self.lrs = lrs
        self.lr_decrease_factor = lr_decrease_factor
        self.epochs_per_decrease = epochs_per_decrease

        # Define machines
        self.vectorizer = Vectorizer(vectorizer_words, vectorizer_forms, name,
                                     embedding_size, filler=ZeroFiller(embedding_size),
                                     ce_enabled=True)

        # self.vectorizer = FastTextVectorizer(name, embedding_size * 2, "ft_sg_syntagrus.bin")

        self.tag_vectorizer = Vectorizer(tag_sents, None, name + "_pos",
                                         tag_embedding_size, filler=ZeroFiller(tag_embedding_size),
                                         ce_enabled=False, tf_enabled=False)

        # Tags embeddings (H).
        # Chunker will get linear combination as an input:
        #    I = H^T * p
        #    p - probabilities vector
        self.tag_embeddings = []
        for i in range(self.num_tags):
            tag = self.index2tag[i].lower()
            self.tag_embeddings.append(self.tag_vectorizer(tag, tag))
        self.tag_embeddings = torch.stack(self.tag_embeddings)
        if self.is_cuda_available:
            self.tag_embeddings = self.tag_embeddings.cuda()

        # Vector size is 1 (TF) + 100 (Word embedding) + 100 (Char grams embedding)
        self.vector_size = self.vectorizer.get_vector_size()

        self.tag_size = self.tag_vectorizer.get_vector_size()

        # Chunk size.
        # Benchmark is 200 (POS hidden) + 201 (embedding) + NUM_TAGS (probabilities)
        self.chunk_size = 2 * embedding_size + self.vector_size + self.tag_size

        # Parse size -- input size for parser.
        # When chunking is not available, parse size is equal to chunk size
        self.parse_size = self.chunk_size

        self.log("tagger input size: {}".format(self.vector_size))
        self.log("chunker input size: {}".format(self.chunk_size))
        self.log("parser input size: {}".format(self.parse_size))

        self.tagger = Tagger(self.vector_size, self.num_tags, "GRU", embedding_size)
        # self.chunker = Tagger(self.chunk_size, self.num_chunks, "LSTM", embedding_size)
        self.parser = SyntaxParser(0, 0, 0, 0, self.parse_size, embedding_size, self.num_rels)

        self.is_tagger_trained = False
        # self.is_chunker_trained = False
        self.is_parser_trained = False

        self.tagger_name = "pos tagging"
        # self.chunker_name = "chunking"
        self.parser_name = "parsing"

        # Try to load from file
        self.tagger_path = "{}/model_pos_{}.pt".format(self.path_base, self.name)
        # self.chunker_path = "{}/model_chunk_{}.pt".format(self.path_base, self.name)
        self.parser_path = "{}/model_parse_{}.pt".format(self.path_base, self.name)

        if os.path.exists(self.tagger_path):
            self.log("Loading POS tagger")
            self.tagger = torch.load(self.tagger_path)
            self.tagger.unit.flatten_parameters()
            self.is_tagger_trained = True
            self.log("Done")

        # if os.path.exists(self.chunker_path):
        #     self.log("Loading chunker")
        #     self.chunker = torch.load(self.chunker_path)
        #     self.chunker.unit.flatten_parameters()
        #     self.is_chunker_trained = True
        #     self.log("Done")

        if os.path.exists(self.parser_path):
            self.log("Loading parser")
            self.parser = torch.load(self.parser_path)
            self.parser.unit.flatten_parameters()
            self.is_parser_trained = True
            self.log("Done")

    ##########################################################################
    def train(self, sents, num_epochs, machines):
        ######################################################################
        # Define optimizers
        tag_optimizer = optim.SGD(self.tagger.parameters(), lr=self.lrs[0])
        # chunk_optimizer = optim.SGD(self.chunker.parameters(), lr=self.lrs[1])
        chunk_optimizer = None

        # Parameters for both machines
        params = list(self.tagger.parameters()) + list(self.parser.parameters())
        parse_optimizer = optim.SGD(params, lr=self.lrs[2])
        tag_loss_function = nn.NLLLoss()
        chunk_loss_function = nn.NLLLoss()
        parse_loss_function = nn.NLLLoss()

        ######################################################################
        # Run loop
        start_time = time.time()
        for epoch in range(num_epochs):
            print("epoch #{}: ".format(epoch), end="", flush=True)
            optimizers = [tag_optimizer, chunk_optimizer, parse_optimizer]
            self.loop(sents, optimizers, [tag_loss_function, chunk_loss_function, parse_loss_function],
                      [None, None, None])
            self.decrease_lr(optimizers, epoch + 1, self.lr_decrease_factor,
                             self.epochs_per_decrease)
        print("elapsed: {} s".format(int(time.time() - start_time)))

        # Out misses
        # self.print_vectorizer_misses()

        # Save model
        torch.save(self.tagger, self.tagger_path)
        # torch.save(self.chunker, self.chunker_path)
        torch.save(self.parser, self.parser_path)

        self.log("Done")

    ##########################################################################
    def test(self, sents):
        # Collect statistics
        tag_score = TagScore(self.tags)
        # chunk_score = ChunkScore(self.chunks)
        chunk_score = None
        parse_score = ParserScore()

        num_correct_tags = 0
        num_correct_chunks = 0
        num_words = 0

        start_time = time.time()
        self.loop(sents, [None, None, None], [nn.NLLLoss(), nn.NLLLoss(), nn.NLLLoss()],
                  [tag_score, chunk_score, parse_score])
        print("elapsed: {} s".format(int(time.time() - start_time)))

        # Out statistics
        print("POS Tagging:")
        f1_s = []
        has_zero = False
        for tag in sorted(self.tags):
            stat = tag_score.stats[tag]

            if stat.num_gold_predicted == 0 or stat.num_gold == 0 or stat.num_predicted == 0:
                print("\tskipped: {:>5} ({} items)".format(tag, stat.num_gold))
                continue

            num_words += stat.num_gold
            num_correct_tags += stat.num_gold_predicted

            precision = stat.num_gold_predicted / max(stat.num_predicted, 1.0)
            recall = stat.num_gold_predicted / max(stat.num_gold, 1.0)

            f1 = 0.0
            if math.isclose(precision, 0.0) or math.isclose(recall, 0.0):
                has_zero = True
            else:
                f1 = hmean([precision, recall])

            f1_s.append(f1)

            print("\t{:>5}: P = {:4.2f}%, R = {:4.2f}%, F1 = {:4.2f}% ({} items)".format(
                tag, precision * 100, recall * 100, f1 * 100, stat.num_gold))

            # ratio = 0
            # if stat[1] != 0:
            #     ratio = stat[0] / stat[1]
            # ratio *= 100
            #
            # print("\t{:>4}: {:4} / {:4} = {:4.2f}%".format(tag, stat[0], stat[1], ratio))

        # print("Chunking:")
        # for chunk in sorted(self.chunks):
        #     stat = chunk_score.stats[chunk]
        #     num_correct_chunks += stat[0]
        #
        #     ratio = 0
        #     if stat[1] != 0:
        #         ratio = stat[0] / stat[1]
        #     ratio *= 100
        #
        #     print("\t{:>4}: {:4} / {:4} = {:4.2f}%".format(chunk, stat[0], stat[1], ratio))

        # self.print_vectorizer_misses()

        # POS aggregated
        print("Total words:", num_words)
        print("Correct POS tags:", num_correct_tags, "({:4.2f}%)".format(
            num_correct_tags / num_words * 100.0))

        average_f1 = 0.0
        if not has_zero:
            average_f1 = hmean(f1_s)
        print("Average F1 = {:4.2f}%".format(average_f1 * 100))

        # Chunks aggregated
        # print("Correct chunk tags:", num_correct_chunks, "({:4.2f}%)".format(
        #     num_correct_chunks / num_words * 100.0))
        # precision = chunk_score.num_retrieved_relevant / chunk_score.num_retrieved
        # recall = chunk_score.num_retrieved_relevant / chunk_score.num_relevant
        # f1_score = hmean([precision, recall])
        # print("\tPrecision: {:f}".format(precision))
        # print("\tRecall: {:f}".format(recall))
        # print("\tF1 score: {:f}".format(f1_score))

        print("Parsing:")
        print("\tUAS: {} from {} ({:4.2f}%)".format(parse_score.num_unlabeled_arcs, num_words,
                                                    parse_score.num_unlabeled_arcs / num_words * 100.0))
        print("\tLAS: {} from {} ({:4.2f}%)".format(
            parse_score.num_labeled_arcs, num_words,
            parse_score.num_labeled_arcs / num_words * 100.0
        ))
        print("\tMUAS: {} from {} ({:4.2f}%)".format(
            parse_score.num_modified_unlabeled_arcs, num_words,
            parse_score.num_modified_unlabeled_arcs / num_words * 100.0
        ))
        print("\tCRel: {} from {} ({:4.2f}%)".format(
            parse_score.num_labels, num_words,
            parse_score.num_labels / num_words * 100.0))
        print("\tUEM: {} from {} ({:4.2f}%)".format(
            parse_score.num_unlabeled_trees, len(sents),
            parse_score.num_unlabeled_trees / len(sents) * 100.0
        ))
        print("\tLEM: {} from {} ({:4.2f}%)".format(
            parse_score.num_labeled_trees, len(sents),
            parse_score.num_labeled_trees / len(sents) * 100.0
        ))
        print("\tMUEM: {} from {} ({:4.2f}%)".format(
            parse_score.num_modified_unlabeled_trees, len(sents),
            parse_score.num_modified_unlabeled_trees / len(sents) * 100.0
        ))

        self.log("Done")

    ##########################################################################
    # Lose control and decrease the pace
    # Warped and bewitched
    # It's time to erase
    @staticmethod
    def decrease_lr(optimizers, epoch, factor, epoch_interval):
        if epoch % epoch_interval != 0:
            return

        print("lr is multiplied by {:f}".format(factor))

        for optimizer in optimizers:
            if optimizer is not None:
                for param_group in optimizer.param_groups:
                    param_group["lr"] *= factor

    ##########################################################################
    # Used both by 'train' and 'test'.
    # The only difference is that optimizers mustn't be provided
    # during testing
    def loop(self, sents, optimizers, loss_functions, scores):
        tag_optimizer = optimizers[0]
        chunk_optimizer = optimizers[1]
        parse_optimizer = optimizers[2]

        tag_loss_function = loss_functions[0]
        chunk_loss_function = loss_functions[1]
        parse_loss_function = loss_functions[2]

        tag_scores = scores[0]
        chunk_scores = scores[1]
        parse_scores = scores[2]

        # Total number of words
        num_words = self.get_num_words(sents)
        words_per_interval = num_words // 10

        # Average losses
        tag_loss = 0
        chunk_loss = 0
        parse_loss = 0

        # Reset vectorizer
        self.vectorizer.reset_counters()

        # Reset progress bar
        print("progress [", end="", flush=True)
        current_word = 0
        next_interval = words_per_interval

        # Dump
        fout = open("result.conllu", "w")

        for sent in sents:
            # Reset optimizers and machines
            # if tag_optimizer is not None:
            #     tag_optimizer.zero_grad()
            # if chunk_optimizer is not None:
            #     chunk_optimizer.zero_grad()
            if parse_optimizer is not None:
                parse_optimizer.zero_grad()
            self.tagger.reset()
            # self.chunker.reset()

            ##############################################################
            # POS Tagger.
            # Prepare input for tagger and targets for chunker
            sequence = []
            tag_targets = []
            # chunk_targets = []
            parse_head_targets = []
            parse_rel_targets = []
            for dep, rel, head in sent:
                form, word, tag = dep
                sequence.append(self.vectorizer(form, word))
                tag_targets.append(self.tag2index[tag])
                # chunk_targets.append(self.chunk2index[chunk])
                parse_head_targets.append(head)
                parse_rel_targets.append(self.rel2index[rel])
            sequence = Variable(torch.stack(sequence, dim=0))
            tag_targets = Variable(torch.LongTensor(tag_targets))
            # chunk_targets = Variable(torch.LongTensor(chunk_targets))
            parse_head_targets = Variable(torch.LongTensor(parse_head_targets))
            parse_rel_targets = Variable(torch.LongTensor(parse_rel_targets))

            if self.is_cuda_available:
                tag_targets = tag_targets.cuda()
                # chunk_targets = chunk_targets.cuda()
                parse_head_targets = parse_head_targets.cuda()
                parse_rel_targets = parse_rel_targets.cuda()

            # Optimize tagger
            tag_output = self.tagger(sequence.view((len(sent), 1, -1)))
            current_loss = tag_loss_function(tag_output, tag_targets)
            tag_loss += current_loss.data[0]
            # if tag_optimizer is not None:
            #     current_loss.backward()
            #     tag_optimizer.step()

            ##############################################################
            # Chunking.
            # Prepare input for chunker
            sequence = Variable(sequence.data)
            probabilities = torch.exp(tag_output)
            probabilities = probabilities.mm(Variable(self.tag_embeddings))
            tagger_hidden = self.tagger.last_output.view((len(sent), -1))
            sequence = torch.cat((tagger_hidden, sequence, probabilities), dim=1)
            # sequence = Variable(torch.cat((sequence, probabilities), dim=1))
            # sequence = Variable(sequence)

            # Optimize chunker
            # chunk_output = self.chunker(sequence.view((len(sent), 1, -1)))
            # current_loss = chunk_loss_function(chunk_output, chunk_targets)
            # chunk_loss += current_loss.data[0]
            # if chunk_optimizer is not None:
            #     current_loss.backward()
            #     chunk_optimizer.step()

            # Optimize parser
            parse_output_heads, parse_output_rels = self.parser(sequence.view(len(sent), 1, -1))
            current_parser_loss = parse_loss_function(parse_output_heads, parse_head_targets)
            current_parser_loss += parse_loss_function(parse_output_rels, parse_rel_targets)
            parse_loss += current_parser_loss.data[0]

            current_loss += current_parser_loss

            if parse_optimizer is not None:
                current_loss.backward()
                parse_optimizer.step()

            ##################################################################
            # Collect stats if necessary
            actual_chunks = []
            is_sent_correct = True
            is_labeled_sent_correct = True
            heads = [0 for i in range(len(sent))]
            rels = ["" for i in range(len(sent))]
            probabilities = [[] for i in range(len(sent))]
            forms = []
            words = []
            tags = []

            if tag_scores is not None or chunk_scores is not None or parse_scores is not None:
                # Output is SEQ_LEN x NUM_TAGS
                for i in range(len(sent)):
                    forms.append(sent[i][0][0])
                    words.append(sent[i][0][1])

                    if tag_scores is not None:
                        maximum, indices = tag_output[i].max(0)
                        predicted = indices.data[0]
                        expected = tag_targets.data[i]

                        tag = sent[i][0][2]
                        stat = tag_scores.stats[tag]
                        stat.num_gold += 1

                        predicted_tag = self.index2tag[predicted]
                        tags.append(predicted_tag)

                        if predicted == expected:
                            stat.num_gold_predicted += 1

                        tag_scores.stats[predicted_tag].num_predicted += 1

                    # if chunk_scores is not None:
                    #     maximum, indices = chunk_output[i].max(0)
                    #     predicted = indices.data[0]
                    #     expected = chunk_targets.data[i]
                    #
                    #     chunk = sent[i][2]
                    #     stat = chunk_scores.stats[chunk]
                    #     stat[1] += 1
                    #
                    #     if chunk[0] == "B":
                    #         chunk_scores.num_relevant += 1
                    #
                    #     actual_chunk = self.index2chunk[predicted]
                    #     actual_chunks.append(actual_chunk)
                    #     if actual_chunk[0] == "B":
                    #         chunk_scores.num_retrieved += 1
                    #
                    #     if predicted == expected:
                    #         stat[0] += 1

                    if parse_scores is not None:
                        for j in range(len(sent) + 1):
                            probabilities[i].append((j, parse_output_heads.data[i][j]))
                        probabilities[i].sort(key=lambda pair: pair[1], reverse=True)

                        maximum, indices = parse_output_heads[i].max(0)
                        predicted = indices.data[0]
                        expected = parse_head_targets.data[i]
                        head = predicted
                        heads[i] = head
                        is_head_correct = expected == predicted
                        if is_head_correct:
                            parse_scores.num_unlabeled_arcs += 1
                        else:
                            is_sent_correct = False
                            is_labeled_sent_correct = False

                        maximum, indices = parse_output_rels[i].max(0)
                        predicted = indices.data[0]
                        expected = parse_rel_targets.data[i]
                        rel = self.index2rel[predicted]
                        rels[i] = rel
                        if expected == predicted:
                            parse_scores.num_labels += 1
                            if is_head_correct:
                                parse_scores.num_labeled_arcs += 1
                        else:
                            is_labeled_sent_correct = False

            # Specially for parser
            if parse_scores is not None:
                fout.write("#text = {}\n".format(" ".join(forms)))

                if is_sent_correct:
                    parse_scores.num_unlabeled_trees += 1
                if is_labeled_sent_correct:
                    parse_scores.num_labeled_trees += 1

                parse_output_heads = parse_output_heads.data
                # Trying to turn random graph into well-formed tree.
                # Step 1. Find a root

                outliers = set()
                roots = set()
                unvisited = set()
                maximum = parse_output_heads[0][0] - 1.0
                index = -1
                for i in range(len(sent)):
                    if parse_output_heads[i][0] > maximum:
                        maximum = parse_output_heads[i][0]
                        index = i
                    if heads[i] == 0 or rels[i] == self.root_tag:
                        roots.add(i)
                    else:
                        unvisited.add(i)
                roots.add(index)

                if len(roots) > 0:
                    options = []
                    for node in roots:
                        tmp_outliers = outliers.copy()
                        tmp_outliers.update(roots)
                        tmp_outliers.remove(node)
                        options.append(try_build_tree(node, heads, unvisited.copy(), tmp_outliers))
                    options.sort(key=lambda t: len(t[2]))
                    root, visited, outliers = options[0]
                else:
                    raise RuntimeError("unreachable branch")
                    # maximum = parse_output_heads[0][0] - 1.0
                    # index = -1
                    # for i in range(len(sent)):
                    #     if parse_output_heads[i][0] > maximum:
                    #         maximum = parse_output_heads[i][0]
                    #         index = i
                    # root = index
                    # if root in outliers:
                    #     outliers.remove(root)
                    # if root in unvisited:
                    #     unvisited.remove(root)
                    # root, visited, outliers = try_build_tree(root, heads, unvisited, outliers)
                heads[root] = 0
                rels[root] = self.root_tag

                # Now 'unvisited' contains only unresolved references.
                # Use minimal algo to resolve arcs
                while len(outliers) > 0:
                    options = []
                    for node in outliers:
                        options.append(try_expand_tree(node, probabilities, heads, visited, outliers))
                    options.sort(key=lambda t: len(t[3]))
                    index, head, visited, outliers = options[0]
                    heads[index] = head

                is_modified_sent_correct = True
                for i in range(len(sent)):
                    if heads[i] == parse_head_targets.data[i]:
                        parse_scores.num_modified_unlabeled_arcs += 1
                    else:
                        is_modified_sent_correct = False
                    fout.write("{}\t{}\t{}\t{}\t_\t_\t{}\t{}\t{}:{}\t_\n".format(
                        i + 1, forms[i], words[i], tags[i], heads[i], rels[i], heads[i], rels[i]))
                fout.write("\n")

                if is_modified_sent_correct:
                    parse_scores.num_modified_unlabeled_trees += 1


            # Specially for chunker determine the quantity of retrieved relevant chunks
            # if chunk_scores is not None:
            #     gold_chunks = [chunk for word, tag, chunk in sent]
            #     num_retrieved_relevant = 0
            #     i = 0
            #     while i < len(gold_chunks):
            #         gold_chunk = gold_chunks[i]
            #         actual_chunk = actual_chunks[i]
            #
            #         if gold_chunk[0] == "B":
            #             is_correct = True
            #             while True:
            #                 if gold_chunk != actual_chunk:
            #                     is_correct = False
            #
            #                 i += 1
            #                 if i == len(gold_chunks):
            #                     break
            #
            #                 gold_chunk = gold_chunks[i]
            #                 actual_chunk = actual_chunks[i]
            #
            #                 if gold_chunk[0] != "I":
            #                     if actual_chunk[0] == "I":
            #                         is_correct = False
            #                     break
            #
            #             if is_correct:
            #                 num_retrieved_relevant += 1
            #         else:
            #             i += 1
            #     chunk_scores.num_retrieved_relevant += num_retrieved_relevant

            # Emulate progress bar
            current_word += len(sent)
            if current_word >= next_interval:
                next_interval += words_per_interval
                print('💪', end="", flush=True)

        # Debug epoch log
        print("], ATL: {:10.8f}, ACL: {:10.8f}, APL: {:10.8f}".format(
            tag_loss / len(sents), chunk_loss / len(sents), parse_loss / len(sents)))
        fout.close()

    ##########################################################################
    def print_vectorizer_misses(self):
        print("unknown words: {} from {} ({:4.2f}%)".format(
            self.vectorizer.num_word_misses, self.vectorizer.num_words,
            self.vectorizer.num_word_misses / self.vectorizer.num_words * 100.0
        ))
        print("unknown grams: {} from {} ({:4.2f}%)".format(
            self.vectorizer.num_char_misses, self.vectorizer.num_grams,
            self.vectorizer.num_char_misses / self.vectorizer.num_grams * 100.0
        ))

    ##########################################################################
    @staticmethod
    def get_num_words(sents):
        num_words = 0
        for sent in sents:
            num_words += len(sent)
        return num_words

    ##########################################################################
    def create_or_load_indices(self):
        tag_path = "{}/{}_tags.txt".format(self.path_base, self.name)
        chunk_path = "{}/{}_chunks.txt".format(self.path_base, self.name)
        rel_path = "{}/{}_rels.txt".format(self.path_base, self.name)

        create_tag_index = False
        create_chunk_index = False
        create_rel_index = False

        # Try load
        if os.path.exists(tag_path):
            # Load from existing data base
            self.log("Loading POS tag index from file")

            for line in open(tag_path):
                tag, index = line.split()
                index = int(index)
                self.tags.add(tag)
                self.tag2index[tag] = index
                self.index2tag[index] = tag

            self.num_tags = len(self.tags)
        else:
            # Create from scratch
            self.log("Creating POS tag index")
            create_tag_index = True

        # Try load
        if os.path.exists(chunk_path):
            # Load chunk index from file
            self.log("Loading chunk index from file")

            for line in open(chunk_path):
                chunk, index = line.split()
                index = int(index)
                self.chunks.add(chunk)
                self.chunk2index[chunk] = index
                self.index2chunk[index] = chunk

            self.num_chunks = len(self.chunks)
        else:
            # Create from scratch
            self.log("Creating chunk tag index")
            create_chunk_index = True

        # Try load
        if os.path.exists(rel_path):
            # Load rel index from file
            self.log("Loading rel index from file")

            for line in open(rel_path):
                rel, index = line.split()
                index = int(index)
                self.rels.add(rel)
                self.rel2index[rel] = index
                self.index2rel[index] = rel

            self.num_rels = len(self.rels)
        else:
            # Create from scratch
            self.log("Creating rel tag index")
            create_rel_index = True

        # Create if necessary
        if create_tag_index or create_chunk_index or create_rel_index:
            # Collect data
            for sent in self.sents:
                self.num_words += len(sent)
                for dep, rel, head in sent:
                    form, word, tag = dep
                    if create_tag_index:
                        self.tags.add(tag)
                    if create_rel_index:
                        self.rels.add(rel)
                    # if create_chunk_index:
                    #     self.chunks.add(chunk)

            # Create POS tag database
            if create_tag_index:
                file_tags = open(tag_path, "w")
                self.num_tags = len(self.tags)
                for index, tag in enumerate(self.tags):
                    self.index2tag[index] = tag
                    self.tag2index[tag] = index
                    file_tags.write("{} {}\n".format(tag, index))
                file_tags.close()

            # Create chunk tag database
            if create_chunk_index:
                file_chunks = open(chunk_path, "w")
                self.num_chunks = len(self.chunks)
                for index, chunk in enumerate(self.chunks):
                    self.index2chunk[index] = chunk
                    self.chunk2index[chunk] = index
                    file_chunks.write("{} {}\n".format(chunk, index))
                file_chunks.close()

            # Create rel tag database
            if create_rel_index:
                file_rels = open(rel_path, "w")
                self.num_rels = len(self.rels)
                for index, rel in enumerate(self.rels):
                    self.index2rel[index] = rel
                    self.rel2index[rel] = index
                    file_rels.write("{} {}\n".format(rel, index))
                file_rels.close()

    ##########################################################################
    def log(self, message):
        print("Model [{}]:".format(self.name), message)
Example n. 33
from data_analysis import DataManager
from vectorizer import Vectorizer
import numpy as np
import pickle
from tempfile import TemporaryFile

dm = DataManager('./data/spam.csv')
dm.most_frequent_character_in_spam()
dm.most_frequent_character_in_legit()
dm.most_frequent_characters()
dm.average_text_length()

sentences, labels = dm.get_text(), dm.get_labels()
labels = list(map(lambda v: 0 if v == 'ham' else 1, labels))
vectorizer = Vectorizer(sentences)

sentences_features = []

for sentence in sentences:
    sentence_vector = vectorizer.text_to_vec(sentence, alpha=0.3)
    sentences_features.append(sentence_vector)

train_x, train_y = sentences_features[0:5000], labels[0:5000]
train_x = np.asarray(train_x)
train_y = np.asarray(train_y)

test_x, test_y = sentences_features[5000:], labels[5000:]
test_x = np.asarray(test_x)
test_y = np.asarray(test_y)

np.savetxt('train_x.txt', train_x)
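# Not part of the original script: a minimal sketch of how the split built above
# could be used to fit and score a simple baseline. LogisticRegression is an
# arbitrary stand-in here, not the model the author trains on these features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

baseline = LogisticRegression(max_iter=1000)
baseline.fit(train_x, train_y)
print('baseline accuracy: {:.4f}'.format(accuracy_score(test_y, baseline.predict(test_x))))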
def test_with_nested_CV(folder='model',folds=5, plot=True, steps=['hashing','tfidf']):
    '''
    
    Evaluates the classifier by doing nested CV
    i.e. keeping 1/folds of the data out of the training and doing training 
    (including model selection for regularizer) on the training set and testing
    on the held-out data
    
    Also prints some stats and figures
    
    INPUT
    folder  folder with model files
    folds   number of folds
    plot    if True, plot and save the confusion matrix
    steps   preprocessing steps passed to the Vectorizer

    '''
    # start timer
    import time
    t0 = time.time()
    # create the bag-of-words transformer (folder for the model files, steps for preprocessing)
    vec = Vectorizer(folder=folder, steps=steps)

    # load data
    data = get_speech_text(folder=folder)
    for key in data.keys():
        data[key] = vec.transform(data[key])
    # create numerical labels
    Y = hstack(map((lambda x: ones(data[data.keys()[x]].shape[0])*x),range(len(data))))
    # create data matrix
    X = vstack(data.values())
    # permute data 
    fsize = len(Y)/folds
    randidx = permutation(len(Y))
    Y = Y[randidx]
    X = X[randidx,:]
    idx = reshape(arange(fsize*folds),(folds,fsize))
    Y = Y[:fsize*folds]
    # allocate matrices for predictions
    predicted = zeros(fsize*folds)
    predicted_prob = zeros((fsize*folds,len(data)))
        
    # the regularization parameters to choose from 
    parameters = {'C': (10.**arange(-4,4,1.)).tolist()}
    
    # do nested CV
    for ifold in range(folds):
        testidx = idx[ifold,:]
        trainidx = idx[setdiff1d(arange(folds),ifold),:].flatten()
        text_clf = LogisticRegression(class_weight='auto',dual=True)
        # for nested CV, do folds-1 CV for parameter optimization
        # within inner CV loop and use the outer testfold as held-out data
        # for model validation
        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=(folds-1))
        gs_clf.fit(X[trainidx,:],Y[trainidx])
        predicted[testidx] = gs_clf.predict(X[testidx,:])
        predicted_prob[testidx,:] = gs_clf.predict_proba(X[testidx,:])
        print '************ Fold %d *************'%(ifold+1)
        print metrics.classification_report(Y[testidx], predicted[testidx],target_names=data.keys()) 
    
    t1 = time.time()
    total_time = t1 - t0
    timestr = 'Wallclock time: %f sec\n'%total_time
    dimstr = 'Vocabulary size: %d\n'%X.shape[-1]
    report = timestr + dimstr
    # extract some metrics
    print '********************************'
    print '************ Total *************'
    print '********************************'
    report += metrics.classification_report(Y, predicted,target_names=data.keys())
    # dump metrics to file
    open(folder+'/report_%s.txt'%'_'.join(sorted(steps)),'wb').write(report)
    print(report)
    conf_mat = metrics.confusion_matrix(Y,predicted)
    open(folder+'/conf_mat_%s.txt'%'_'.join(sorted(steps)),'wb').write(json.dumps(conf_mat.tolist()))
    print(conf_mat)
    
    if plot:
        # print confusion matrix
        import pylab
        pylab.figure(figsize=(16,16))
        pylab.imshow(metrics.confusion_matrix(Y,predicted),interpolation='nearest')
        pylab.colorbar()
        pylab.xticks(arange(len(data)),[x.decode('utf-8') for x in data.keys()])
        pylab.yticks(arange(len(data)),[x.decode('utf-8') for x in data.keys()])
        pylab.xlabel('Predicted')
        pylab.ylabel('True')
        font = {'family' : 'normal', 'size'   : 30}
        pylab.rc('font', **font)
        pylab.savefig(folder+'/conf_mat.pdf',bbox_inches='tight')
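# Example invocation (assumes a 'model' folder with the speech data and the
# module-level imports this function relies on):
#
#     test_with_nested_CV(folder='model', folds=5, plot=True, steps=['hashing', 'tfidf'])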
word_embeddings_file_path = args.word2vec
pretrained_weights_file_path = args.save
epochs = args.epochs
df = read_SEMEVAL_data(args.data)

# initialize objects
print('Initializing objects ...')
print('Initializing word embeddings ...')
t1 = time.time()
word_embeddings = WordEmbeddings(word_embeddings_file_path)
t2 = time.time()
print('\tTook %f seconds' % (t2 - t1))
print('Initializing tokenizer ...')
tokenizer = Tokenizer()
print('Initializing vectorizer ...')
vectorizer = Vectorizer(word_embeddings, tokenizer)

#### training dataset ####
# vectorizing
ids, train_a_vectors, train_b_vectors, train_gold = vectorizer.vectorize_df(df)
train_max_a_length = len(max(train_a_vectors, key=len))
train_max_b_length = len(max(train_b_vectors, key=len))
print('maximum number of tokens per sentence A in training set is %d' %
      train_max_a_length)
print('maximum number of tokens per sentence B in training set is %d' %
      train_max_b_length)
max_len = max([train_max_a_length, train_max_b_length])

# padding
train_a_vectors = pad_tensor(train_a_vectors, max_len)
train_b_vectors = pad_tensor(train_b_vectors, max_len)
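# pad_tensor is not defined in this excerpt; the sketch below shows what a
# zero-padding helper like it could look like, assuming each element of
# `vectors` is a sequence of per-token embedding vectors of equal dimension.
import numpy as np

def pad_tensor_sketch(vectors, max_len):
    """Zero-pad every sequence of word vectors to max_len tokens."""
    dim = len(vectors[0][0])
    padded = np.zeros((len(vectors), max_len, dim), dtype=np.float32)
    for i, seq in enumerate(vectors):
        seq = np.asarray(seq, dtype=np.float32)[:max_len]
        padded[i, :len(seq), :] = seq
    return padded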
Example n. 36
        if pred == vec.end_tag:
            break
        else:
            res += pred

        # next_hidden = sess.run(tensors['next_hidden'], feed_dict=feed_dict)
        # initial_state = np.vstack((initial_state, next_hidden))[1:]

    return res


if __name__ == '__main__':
    print 'Loading data...'
    with open('../../data/smalldata.txt', 'r') as f:
        data = [line.strip() for line in f]
    vectorizer = Vectorizer(seq_length=25)
    print 'Fitting Vectorizer...'
    X_data, y_data = vectorizer.fit_transform(data)

    with open('vectorizer.pkl', 'w') as f:
        pickle.dump(vectorizer, f)

    N, seq_length, input_dim = X_data.shape
    hidden_dim = 128
    output_dim = input_dim

    X = tf.placeholder(tf.float32, [None, seq_length, input_dim], 'X')
    y = tf.placeholder(tf.float32, [None, output_dim], 'y')
    initial_state = tf.placeholder(tf.float32, [None, 2 * hidden_dim], 'initial_state')
    
    lstm, next_hidden = lstm_layer(X, input_dim, seq_length, hidden_dim, 
Example n. 37
class Trainer(object):
    """Trains the classifier with training data and does the cross validation.
    """

    def __init__(self):
        """Initializes the datastructures required.
        """
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Initializes all types of training data we have.
        """
        corpus_file = open(os.path.join(datasettings.DATA_DIRECTORY,
                                        'full-corpus.csv'))

        classification, tweets = parse_training_corpus(corpus_file)

        reviews_positive = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'positive'))

        num_positive_reviews = len(reviews_positive)
        class_positive = ['positive'] * num_positive_reviews

        reviews_negative = parse_imdb_corpus(
            os.path.join(datasettings.DATA_DIRECTORY, 'negative'))
        num_negative_reviews = len(reviews_negative)
        class_negative = ['negative'] * num_negative_reviews

        self.data = tweets
        self.classification = classification

        #self.date_time = date_time
        #self.retweet = retweets
        #self.favorited = favorited

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        classification_vector = numpy.array(map(
            lambda s: SENTIMENT_MAP.get(s.lower(), 0),
            self.classification))

        feature_vector = self.vectorizer.fit_transform(self.data)

        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """ Build sentiment dictionary and build vector of 
            weights for tweets.
        """
        fileIn = open(os.path.join(datasettings.DATA_DIRECTORY,
                                   'AFINN-96.txt'))
        wordDict = {}
        line = fileIn.readline()
        while line != '':
            temp = string.split(line, '\t')
            wordDict[temp[0]] = int(temp[1])
            line = fileIn.readline()
        fileIn.close()

        fileIn = open(os.path.join(datasettings.DATA_DIRECTORY,
                                   'AFINN-111.txt'))
        line = fileIn.readline()
        while line != '':
            temp = string.split(line, '\t')
            wordDict[temp[0]] = int(temp[1])
            line = fileIn.readline()
        fileIn.close()

        word_dict_vector = []
        for tweet in self.data:
            word_list = tweet.split()
            tweet_score = 0
            for word in word_list:
                if word in wordDict:
                    tweet_score += wordDict[word]
            word_dict_vector.append(tweet_score)

        return word_dict_vector

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.
        """
        feature_vector = self.vectorizer.transform(test_data)
        return feature_vector

    def score_func(self, true, predicted):
        """Score function for the validation.
        """
        return metrics.precision_recall_fscore_support(
            true, predicted,
            labels=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
                ],
            average='macro')

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Args:
            k: The number of folds for cross validation.
        """
        self.scores = []

        X, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        for train, test in cv:
            self.classifier1.fit(self.feature_vector[train],
                          self.classification_vector[train])
            self.classifier2.fit(self.feature_vector[train],
                          self.classification_vector[train])
            self.classifier3.fit(self.feature_vector[train],
                          self.classification_vector[train])
            classification1 = self.classifier1.predict(
                self.feature_vector[test])
            classification2 = self.classifier2.predict(
                self.feature_vector[test])
            classification3 = self.classifier3.predict(
                self.feature_vector[test])

            classification = []
            for predictions in zip(classification1, classification2,
                                   classification3):
                neutral_count = predictions.count(0)
                positive_count = predictions.count(1)
                negative_count = predictions.count(-1)
                if (neutral_count == negative_count and
                    negative_count == positive_count):
                    classification.append(predictions[0])
                elif (neutral_count > positive_count and
                    neutral_count > negative_count):
                    classification.append(0)
                elif (positive_count > neutral_count and
                    positive_count > negative_count):
                    classification.append(1)
                elif (negative_count > neutral_count and
                    negative_count > positive_count):
                    classification.append(-1)
            classification = numpy.array(classification)

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self, cross_validate=False, mean=False,
                           serialize=False):
        """Trains the SVC with the training data and validates with the test data.

        We do a K-Fold cross validation with K = 10.
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1',
                                         C=1000,dual=False, tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            self.classifier1.fit(self.feature_vector,
                                 self.classification_vector)
            self.classifier2.fit(self.feature_vector,
                                 self.classification_vector)
            self.classifier3.fit(self.feature_vector,
                                 self.classification_vector)

        if serialize:
            classifiers_file = open(os.path.join(
                datasettings.DATA_DIRECTORY, 'classifiers.pickle'), 'wb')
            cPickle.dump([self.classifier1,
                          self.classifier2,
                          self.classifier3], classifiers_file)
            vectorizer_file = open(os.path.join(
                datasettings.DATA_DIRECTORY, 'vectorizer.pickle'), 'wb')
            cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.
        """
        for i, score in enumerate(self.scores):
            print "Cross Validation: %d" % (i + 1)
            print "*" * 40
            if mean:
                print "Mean Accuracy: %f" % (score)
            else:
                print "Precision\tRecall\t\tF-Score"
                print "~~~~~~~~~\t~~~~~~\t\t~~~~~~~"
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print "%f\t%f\t%f" % (precision, recall, f_score)


            print
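# A hypothetical driver for the Trainer above (not part of the original file);
# it assumes datasettings.DATA_DIRECTORY contains the corpus files referenced
# in initialize_training_data.
if __name__ == '__main__':
    trainer = Trainer()
    trainer.initialize_training_data()
    trainer.train_and_validate(cross_validate=10, serialize=True)
    trainer.build_ui()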
class Classifier:

    def __init__(self,folder='model',train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model is stored
        train   set True if you want to train 

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(self.folder+'/classifier.pickle')) or train:
            print 'Training classifier'
            self.train()
        print 'Loading classifier'
        clfdict = cPickle.load(open(self.folder+'/classifier.pickle'))
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']

    def predict(self,text):
        '''
        Loads scikit-learn Bag-of-Word extractor and classifier and
        applies it to some text. 

        INPUT
        text    a string to assign to a party
        folder  the folder containing the classifier and bag-of-words transformer pickles
        
        '''

        # transform string into sparse matrix
        x = self.bow(text)
        # predict probabilities of each party
        probabilities = self.clf.predict_proba(x)
        # transform the predictions into json output
        result = {'text':text,'prediction':[]}
        # predict_proba returns class probabilities in label order, which matches self.parties
        for pidx in range(len(self.parties)): 
            result['prediction'].append(
                {   'party':self.parties[pidx],
                    'probability':probabilities.flatten()[pidx]
                })
        return result

    def bow(self,text):
        if type(text) is not list:
            text = [text]
        return self.bow_vectorizer.transform(text)
   
    def train(self,folds = 2):
        '''
        trains a classifier on the bag of word vectors extracted with extract_bundestag speeches.py

        INPUT
        folder  the folder to store the model file and load the bag-of-words-vectorizer file
        folds   number of cross-validation folds for optimizing the regularizer of the classifier

        '''
        try:
            # load the data
            data = get_speech_text(folder=self.folder)
            for key in data:
                data[key] = self.bow(data[key])
        except:
            print('Could not load text data file in ' + self.folder + '\n' + \
                  'Try executing [python downloader.py --download --parse]')
            raise
        # create numerical labels for each party
        Y = hstack(map((lambda x: ones(data[data.keys()[x]].shape[0])*x),range(len(data))))
        # create the data matrix
        X = vstack(data.values())
        # estimate fold size (if not a divisor of total samples)
        fsize = len(Y)/folds
        # permute data indices for training
        randidx = permutation(len(Y))
        Y = Y[randidx]
        X = X[randidx,:]
        # the classifier, accounting for unbalanced classes
        text_clf = LogisticRegression(class_weight='auto',dual=True)
        # the regularizer
        parameters = {'C': (10.**arange(-5,5,1.)).tolist()}
        # perform gridsearch to get the best regularizer
        gs_clf = GridSearchCV(text_clf, parameters, cv=folds, n_jobs=-1,verbose=2)
        gs_clf.fit(X,Y)
        print "Classifier reached mean %0.2f accuracy with regularizer: %f"%(gs_clf.best_score_, gs_clf.best_params_['C'])
        # dump classifier to pickle
        cPickle.dump({'classifier':gs_clf,'labels':data.keys()},open(self.folder+'/classifier.pickle','wb'),-1)
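# A hypothetical usage sketch for the Classifier above (not from the original
# source); it assumes a 'model' folder with the Bag-of-Words files, so that
# training can run if classifier.pickle does not exist yet.
if __name__ == '__main__':
    clf = Classifier(folder='model')
    result = clf.predict('some speech text to classify')
    for prediction in result['prediction']:
        print '%s: %.3f' % (prediction['party'], prediction['probability'])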