Example #1
def filing_to_labeled_sentences(filing_text):
    # Split a cleaned filing into per-sentence LabeledSentence objects.
    # clean_filing, tokenizer, and filing_to_wordlist are helpers defined
    # elsewhere in this project.
    clean_text = clean_filing(filing_text)
    raw_sentences = tokenizer.tokenize(clean_text.strip())
    sentences = []

    for i, raw_sentence in enumerate(raw_sentences):
        if len(raw_sentence) > 0:
            # LabeledLineSentence expects a file path, so it cannot wrap a
            # single word list; the old per-sentence class is
            # LabeledSentence(words, labels).
            sentences.append(doc2vec.LabeledSentence(
                filing_to_wordlist(raw_sentence), ['SENT_%d' % i]))

    return sentences
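
A side note on API: LabeledSentence and LabeledLineSentence were dropped in later gensim releases in favor of TaggedDocument. A rough modern equivalent of the function above, assuming the same clean_filing, tokenizer, and filing_to_wordlist helpers the original relies on (the SENT_ tag scheme is borrowed from the later examples):

from gensim.models.doc2vec import TaggedDocument

def filing_to_tagged_documents(filing_text):
    # Same flow as above, but with the current gensim API.
    clean_text = clean_filing(filing_text)
    raw_sentences = tokenizer.tokenize(clean_text.strip())
    return [TaggedDocument(words=filing_to_wordlist(s), tags=['SENT_%d' % i])
            for i, s in enumerate(raw_sentences) if s]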
Example #2
def get_clean_filing_text(cik, ticker='', year_start=2000, year_end=2015):
    conn = psycopg2.connect(settings.CONN_STRING)
    cur = conn.cursor()
    cur.execute(
        'select f.cik, f.date_filed, f.form_type, f.file_name from filing_index f where f.cik = %s;',
        (cik, ))

    for record in cur:
        date_filed = record[1]
        form_type = record[2]
        file_name = record[3]

        if (form_type in settings.FORM_TYPES and
                year_start <= date_filed.year <= year_end):

            # figure out the raw and clean filenames
            filename = '{}_{}_{}_{}.txt'.format(ticker, cik, form_type,
                                                str(date_filed))
            filename_clean = '{}_{}_{}_{}_clean.txt'.format(
                ticker, cik, form_type, str(date_filed))

            # check if the clean file exists - if not, create it
            if os.path.isfile(settings.BASE_PATH_FILINGS + filename_clean):
                print('Filing {} already exists. Skipping.'.format(
                    filename_clean))
            else:
                # read the raw filing, clean it, and only then write the
                # clean file, so a failed cleaning does not leave behind an
                # empty file that the existence check above would then skip
                print('Cleaning file {} and writing to {}'.format(
                    filename, filename_clean))
                try:
                    with open(settings.BASE_PATH_FILINGS + filename, 'r') as f:
                        filing_text = f.read()
                    clean_text = clean_filing(filing_text)
                    with open(settings.BASE_PATH_FILINGS + filename_clean,
                              'w') as f_clean:
                        f_clean.write(clean_text)
                except UnicodeError:
                    print('Character mapping error. Skipping...')
                    continue

            # train the model on this filing
            print("Training model with {} filing for {} for date {}...".format(
                form_type, ticker, date_filed))
            sentences = doc2vec.LabeledLineSentence(
                settings.BASE_PATH_FILINGS + filename_clean)
            model.build_vocab(sentences)
            model.train(sentences)
    cur.close()
    conn.close()
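
In most gensim versions, build_vocab resets the model's vocabulary (and with it the learned weights), so the per-filing loop above effectively starts over on each filing. A minimal sketch of the more conventional single-pass pattern under modern gensim (4.x); the filings/*_clean.txt glob and the SENT_ tag scheme are assumptions, not part of the original:

import glob
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class CleanFilings(object):
    """Stream one TaggedDocument per line across all cleaned filings."""
    def __iter__(self):
        tag = 0
        for path in glob.glob('filings/*_clean.txt'):
            with open(path) as f:
                for line in f:
                    yield TaggedDocument(line.split(), ['SENT_%d' % tag])
                    tag += 1

corpus = CleanFilings()
model = Doc2Vec(vector_size=100, window=8, min_count=5, workers=8)
model.build_vocab(corpus)   # vocabulary built once over the whole corpus
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)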
Example #3
import logging
import os
import sys

import chardet
import pandas as pd
from gensim.models import doc2vec
from sklearn.feature_extraction.text import TfidfVectorizer

if __name__ == '__main__':
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'iitb.tsv'),
                        header=0, delimiter="\t", quoting=3)
    print("Read %d labeled train reviews\n" % train["review"].size)

    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))

    num_features = 800    # Word vector dimensionality
    min_word_count = 1    # Minimum word count
    num_workers = 8       # Number of threads to run in parallel
    context = 8           # Context window size
    downsampling = 1e-4   # Downsample setting for frequent words

    input_file = 'data/hindi_review_movie.txt'
    sentences = doc2vec.LabeledLineSentence(input_file)
    print("Training Doc2Vec model...")
    model = doc2vec.Doc2Vec(sentences, workers=num_workers, size=num_features,
                            min_count=min_word_count, window=context)
    model.init_sims(replace=True)

    #model_name = "300features_40minwords_10context"
    #model.save(model_name)

    # re-read the corpus and detect each line's encoding with chardet
    # (chardet expects byte strings, so this part is Python 2 style)
    with open('data/hindi_review_movie.txt') as f:
        corpus = [line.strip() for line in f]
    decoded = [x.decode(chardet.detect(x)['encoding']) for x in corpus]

    # tokenize is defined elsewhere in the original script
    vectorizer = TfidfVectorizer(tokenizer=tokenize, use_idf=True, max_df=0.3,
                                 min_df=0.001, strip_accents='unicode')
Example #4
from gensim.models import doc2vec
import logging, sys

logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

# 'hindi2' holds one sentence per line; LabeledLineSentence labels them
# SENT_0, SENT_1, ... and old gensim keeps those labels in the word vocabulary
sentences = doc2vec.LabeledLineSentence('hindi2')
model = doc2vec.Doc2Vec(sentences, size=100, window=8, min_count=5, workers=8)
#model.save("hindi_doc2vec")
#model=doc2vec.Doc2Vec.load("hindi_doc2vec")
print(model["SENT_0"])  # learned vector for the first sentence
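
Under gensim 4.x the same lookup goes through model.dv: TaggedLineDocument tags each line of the file with its integer line number, and document vectors live in their own keyed store rather than the word vocabulary. A minimal sketch, assuming the same one-sentence-per-line 'hindi2' file:

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

sentences = TaggedLineDocument('hindi2')   # tags lines 0, 1, 2, ...
model = Doc2Vec(sentences, vector_size=100, window=8, min_count=5, workers=8)
print(model.dv[0])   # vector for the first line, cf. model["SENT_0"] above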
Example #5
from gensim.models import word2vec, doc2vec
import logging
import sys

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

#sentences = word2vec.Text8Corpus('english')
sentences = doc2vec.LabeledLineSentence('english')
model = doc2vec.Doc2Vec(sentences, workers=8, size=500, min_count=5, window=8)
#model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
model.save('amazon_mp3_review.model')
print(model.most_similar("SENT_0"))  # sentences most similar to the first one
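
The equivalent similarity query under gensim 4.x also goes through model.dv. A minimal sketch, assuming the same one-sentence-per-line 'english' file:

from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument

sentences = TaggedLineDocument('english')
model = Doc2Vec(sentences, vector_size=500, window=8, min_count=5, workers=8)
model.save('amazon_mp3_review.model')
print(model.dv.most_similar(0))   # nearest documents to the first line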