Ejemplo n.º 1
0
def train_doc2vec_curia(min_count, epoch_num, embedding_dim, learning_rate):
    """Train a Doc2Vec model on the 'Judgment' documents and persist it.

    Every document's nested content is flattened into a single sequence and
    tagged with its position in the iteration order of the content generator.
    The trained model is saved below ``trained_models`` (file name taken from
    ``helpers.setup_json['doc2vec_path']``); afterwards the document
    embeddings are written through ``save_doc_embeddings_doc2vec``.

    Args:
        min_count: passed to gensim as ``min_count``.
        epoch_num: passed to gensim as ``iter``.
        embedding_dim: passed to gensim as ``size``.
        learning_rate: passed to gensim as ``alpha``.
    """
    print("Initializing database and loading documents...")
    judgments = table_docs.get_docs_with_names(['Judgment'])

    helpers.create_folder_if_not_exists('trained_models')
    target_file = helpers.setup_json['doc2vec_path']
    model_path = os.path.join('trained_models', target_file)

    # One tagged document per judgment; the tag is the running index.
    tagged_docs = []
    for tag, nested_content in enumerate(ContentGenerator(judgments)):
        flat_content = list(chain.from_iterable(nested_content))
        tagged_docs.append(
            gensim.models.doc2vec.TaggedDocument(flat_content, [tag]))

    print('Initializing and training model...')
    # NOTE(review): ``iter`` and ``size`` are the gensim 3.x keyword names;
    # gensim 4 renamed them to ``epochs`` / ``vector_size`` -- confirm which
    # gensim version this project pins before upgrading.
    model = gensim.models.Doc2Vec(
        documents=tagged_docs,
        dm=1,
        size=embedding_dim,
        window=3,
        min_count=min_count,
        negative=5,
        alpha=learning_rate,
        iter=epoch_num,
        workers=4,
    )

    # Persist the fully trained model.
    model.save(model_path)

    print('Saving document embeddings...')
    save_doc_embeddings_doc2vec('doc2vec.pickle', model)
Ejemplo n.º 2
0
def train_word2vec_curia(min_count, epoch_num, embedding_dim, learning_rate):
    """Train a skip-gram Word2Vec model on the 'Judgment' documents.

    The per-document contents produced by ``ContentGenerator`` are chained
    into one flat list of items, which is handed to gensim as ``sentences``.
    The word vectors are exported in binary word2vec format below
    ``trained_models`` (file name taken from
    ``helpers.setup_json['word2vec_path']``); afterwards the document
    embeddings are written through ``save_doc_embeddings_word2vec``.

    Args:
        min_count: passed to gensim as ``min_count``.
        epoch_num: passed to gensim as ``iter``.
        embedding_dim: passed to gensim as ``size``.
        learning_rate: passed to gensim as ``alpha``.
    """
    print("Initializing database and loading documents...")
    judgments = table_docs.get_docs_with_names(['Judgment'])

    helpers.create_folder_if_not_exists('trained_models')
    target_file = helpers.setup_json['word2vec_path']
    model_path = os.path.join('trained_models', target_file)

    # Materialise everything up front: one entry per item of each document.
    sentences = [
        sentence
        for doc_content in ContentGenerator(judgments)
        for sentence in doc_content
    ]

    print('Initializing and training model...')
    # NOTE(review): ``iter`` and ``size`` are the gensim 3.x keyword names;
    # gensim 4 renamed them to ``epochs`` / ``vector_size`` -- confirm which
    # gensim version this project pins before upgrading.
    model = gensim.models.Word2Vec(
        sentences=sentences,
        sg=1,
        size=embedding_dim,
        window=3,
        min_count=min_count,
        negative=5,
        alpha=learning_rate,
        iter=epoch_num,
        workers=4,
    )

    # Only the keyed vectors are exported, in binary word2vec format.
    model.wv.save_word2vec_format(model_path, binary=True)

    print('Saving document embeddings...')
    save_doc_embeddings_word2vec('word2vec.pickle', model)
Ejemplo n.º 3
0
def train_lsi_curia(embedding_dim):
    """Train an LSI model on TF-IDF weighted 'Judgment' documents.

    Builds a gensim dictionary and a TF-IDF model from the flattened document
    contents, fits an ``LsiModel`` with ``embedding_dim`` topics, and saves
    all three artefacts. The dictionary and TF-IDF model are stored next to
    the LSI model, using the model path without its extension plus the
    suffixes ``_dict.bin`` and ``_tfidf.bin``. Finally the document
    embeddings are written through ``save_doc_embeddings_lsi``.

    Args:
        embedding_dim: passed to gensim as ``num_topics``.
    """
    print("Initializing database and loading documents...")
    judgments = table_docs.get_docs_with_names(['Judgment'])

    helpers.create_folder_if_not_exists('trained_models')
    model_path = os.path.join('trained_models', helpers.setup_json['lsi_path'])
    path_stem = os.path.splitext(model_path)[0]

    # Flatten each document's nested content into a single sequence.
    flat_contents = []
    for nested_content in ContentGenerator(judgments):
        flat_contents.append(list(chain.from_iterable(nested_content)))

    print('Initializing and training model...')
    dictionary = gensim.corpora.Dictionary(flat_contents)
    bow_corpus = list(map(dictionary.doc2bow, flat_contents))
    tfidf = gensim.models.TfidfModel(bow_corpus)
    tfidf_corpus = tfidf[bow_corpus]

    model = gensim.models.LsiModel(
        tfidf_corpus,
        num_topics=embedding_dim,
        id2word=dictionary,
    )

    # Persist the dictionary and TF-IDF model alongside the LSI model.
    dictionary.save(path_stem + '_dict.bin')
    tfidf.save(path_stem + '_tfidf.bin')
    model.save(model_path)

    print('Saving document embeddings...')
    save_doc_embeddings_lsi('lsi.pickle', model, dictionary, tfidf)
Ejemplo n.º 4
0
def save_doc_embeddings_word2vec(file_name, model):
    """Compute and pickle one embedding per 'Judgment' document.

    Each embedding is obtained from ``get_embedding_doc_word2vec`` (called
    with ``stopword_removal=True``) using the given word2vec or fasttext
    model. The result is a list of ``{'doc_id': ..., 'emb': ...}`` dicts,
    pickled to ``saved_embeddings/<file_name>``.
    """
    judgments = table_docs.get_docs_with_names(['Judgment'])
    doc_contents = ContentGenerator(judgments)

    helpers.create_folder_if_not_exists('saved_embeddings')

    # Documents and their generated contents are walked in lockstep.
    records = [
        {
            'doc_id': judgment['id'],
            'emb': get_embedding_doc_word2vec(
                doc_content, model, stopword_removal=True),
        }
        for judgment, doc_content in zip(judgments, doc_contents)
    ]

    out_path = os.path.join('saved_embeddings', file_name)
    with open(out_path, 'wb') as out_file:
        pickle.dump(records, out_file)
Ejemplo n.º 5
0
def save_doc_embeddings_lsi(file_name, model, dictionary, tfidf):
    """Compute and pickle one LSI embedding per 'Judgment' document.

    Each document's nested content is flattened first, then embedded via
    ``get_embedding_doc_lsi`` with the given model, dictionary and TF-IDF
    model. The result is a list of ``{'doc_id': ..., 'emb': ...}`` dicts,
    pickled to ``saved_embeddings/<file_name>``.
    """
    judgments = table_docs.get_docs_with_names(['Judgment'])

    # Flatten all contents before anything else is touched.
    flat_contents = []
    for nested_content in ContentGenerator(judgments):
        flat_contents.append(list(chain.from_iterable(nested_content)))

    helpers.create_folder_if_not_exists('saved_embeddings')

    records = [
        {
            'doc_id': judgment['id'],
            'emb': get_embedding_doc_lsi(
                flat_content, model, dictionary, tfidf),
        }
        for judgment, flat_content in zip(judgments, flat_contents)
    ]

    out_path = os.path.join('saved_embeddings', file_name)
    with open(out_path, 'wb') as out_file:
        pickle.dump(records, out_file)
Ejemplo n.º 6
0
import argparse
from itertools import chain
from lazylawyer.content_generator import ContentGenerator
from lazylawyer.database import table_cases, table_docs, table_doc_contents
from lazylawyer import helpers
import numpy as np
import os
import pickle
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer

# Runs at import time: make sure the model directory exists, then build the
# module-wide model path from the 'subject_classifier_path' setup entry.
helpers.create_folder_if_not_exists('trained_models')
model_path = os.path.join(
    'trained_models', helpers.setup_json['subject_classifier_path'])


def _identity_func(x):
    """Return the argument unchanged."""
    return x


def _multiply_func(x):
    """Return ``x`` multiplied by 1000."""
    scaled = x * 1000
    return scaled


def generate_class_labels():
    labels_path = os.path.join(
        'trained_models', helpers.setup_json['subject_classifier_labels_path'])
Ejemplo n.º 7
0
from lazylawyer import helpers
import os
from pathlib import Path
import requests


def download_doc_for_case(case, doc):
    """Download a document belonging to a case, if it has a link.

    The target folder is ``doc_dir/<folder>`` where ``<folder>`` comes from
    ``helpers.case_name_to_folder(case['name'])``; it is created if missing,
    even when nothing ends up being downloaded. The file is stored in that
    folder as ``<doc id>.<doc format>``.
    """
    case_folder = helpers.case_name_to_folder(case['name'])
    folder_path = Path('doc_dir/' + case_folder)
    helpers.create_folder_if_not_exists(folder_path)

    doc_filename = '.'.join((str(doc['id']), doc['format']))

    link = doc['link']
    if link is None:
        # No source to download from.
        return

    helpers.download_file(link, folder_path / doc_filename)