Example no. 1
    def __init__(self, language):

        self.settings = Settings()

        self.language = language
        self.text_processor = TextProcessor(self.language)
        self.signalwords = self.settings.signal_words_ru
Example no. 2
 def process(self, text, doc_id):
     '''
     Calls the TextProcessor class to process and tokenize documents.
     Each URL is assigned a unique id; the id is incremented for each new URL.
     Returns the updated index.
     '''
     processor = TextProcessor(text, self.index, doc_id)
     processor.process_text()
     return processor.get_index()
Example no. 3
class InvertedIndex:
    def __init__(self, fpath, dump_fpath):
        self.text_processor = TextProcessor()

        self.queries = []
        self.documents = []
        self.original_documents = []
        self.is_duplicates = []

        self.vectorizer = CountVectorizer()
        self.build_index(fpath)

        self.dump(dump_fpath)

    def build_index(self, fpath):
        with open(abspath(fpath), 'r', encoding='utf-8') as file:
            table = csv_reader(file)

            for row in tqdm(list(table)):
                if row[0] == '':
                    continue
                # TODO: build the full index
                if row[0] == '10000':
                    break

                self.queries.append(self.text_processor.process(row[1]))
                self.documents.append(self.text_processor.process(row[2]))

                self.original_documents.append(row[2])
                self.is_duplicates.append(row[3])

        self.vectorizer.fit_transform(self.queries)

    def dump(self, dump_fpath):
        json_encoded = jp_encode(self)

        with open(dump_fpath, 'w', encoding='utf-8') as file:
            json_dump(json_encoded, file, ensure_ascii=False, indent=4)

    @staticmethod
    def restore(dump_fpath):
        with open(dump_fpath, "r", encoding='utf-8') as file:
            idx_dump = json_load(file)
        return jp_decode(idx_dump)

    @staticmethod
    def from_dump_or_build(dump_fpath, corpora_fpath):
        if isfile(dump_fpath):
            try:
                return InvertedIndex.restore(dump_fpath)
            except Exception:
                return InvertedIndex(corpora_fpath, dump_fpath)
        else:
            return InvertedIndex(corpora_fpath, dump_fpath)
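
A short usage sketch for the class above; the file paths are placeholders (assumptions), while from_dump_or_build and restore come directly from the snippet: the index is restored from a previous dump when the dump file exists and decodes cleanly, and is otherwise rebuilt from the corpus csv and dumped again.

# Hypothetical paths; the calls mirror the class above.
index = InvertedIndex.from_dump_or_build('index_dump.json', 'corpus.csv')
print(len(index.queries), len(index.documents))
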
Example no. 4
    def __init__(self, fpath, dump_fpath):
        self.text_processor = TextProcessor()

        self.queries = []
        self.documents = []
        self.original_documents = []
        self.is_duplicates = []

        self.vectorizer = CountVectorizer()
        self.build_index(fpath)

        self.dump(dump_fpath)
Example no. 5
class PtBrTwitter():


    def __init__(self, dir_in, dir_out):
        self.dir_in = dir_in
        self.dir_out = dir_out
        self.tw_files = ([file for root, dirs, files in os.walk(self.dir_in)
            for file in files if file.endswith('.json') ])
        self.doc_list = list()
        self.date_list = list()
        self.tp = TextProcessor()

    def read(self):
        
        for tw_file in self.tw_files:
            with open(self.dir_in+tw_file) as data_file:
                for line in data_file:
                    tweet = json.loads(line)
                    self.doc_list.append(tweet['text'])
                    self.date_list.append(tweet['created_at'])
    def tokenizeAndSave(self, file_name):
        tweets = self.tp.text_process(self.doc_list)
        tweets = list(itertools.chain.from_iterable(tweets))
        t_count = Counter(tweets)
        with open(self.dir_out+file_name, 'wb') as handle:
            pickle.dump(t_count, handle)

    def loadCounter(self, file_name):
        with open(self.dir_out+file_name, 'rb') as handle:
            t_count = pickle.load(handle)
        return t_count
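
A brief usage sketch for the class above, with placeholder directories (assumptions); it chains the three methods defined in the snippet: read() collects tweet texts and dates from the *.json files, tokenizeAndSave() pickles a Counter of tokens, and loadCounter() reads it back.

# Directory names are placeholders; the calls mirror the class above.
ptbr = PtBrTwitter('tweets_json/', 'counters/')
ptbr.read()
ptbr.tokenizeAndSave('token_counts.pck')
counts = ptbr.loadCounter('token_counts.pck')
print(counts.most_common(10))
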
Example no. 6
    def __init__(self, language):

        self.settings = Settings()

        self.language = language
        self.text_processor = TextProcessor(self.language)
        self.signalwords = self.settings.signal_words_ru
Example no. 7
    def GenerateLocalizableFile(self):
        """Generates a localizable representation of the article.

    This method grabs the English version of the article, runs the text through
    `TextProcessor`, and writes the result to `localizableFilePath`.

    Returns:
      An absolute path to the article's localizable representation.

    Raises:
      ArticleException: If no English version of an article is available.
    """
        if 'en' not in self.locales:
            error = """
ArticleException:
- path: %s
- locales: %s
- No English edition found.
"""
            raise ArticleException(error % (self.path, self.locales))

        original = self.GetOriginalFilePath('en')
        with codecs.open(original, 'r', 'UTF-8') as infile:
            with codecs.open(self.localizable_file_path, 'w',
                             'UTF-8') as output:
                temp = TextProcessor(django=infile.read())
                output.write(temp.html)
        return self.localizable_file_path
Example no. 8
    def run(self):
        logger.info('Creating text processor')
        text_processor = TextProcessor()

        for file in self.input().keys():
            logger.info('Reading %s file: "%s"', file, self.input()[file].path)
            df = pd.read_csv(self.input()[file].path)

            logger.info('Its %s lines', df.shape[0])
            logger.info('Start processing %s...', file)

            df.name = df.name.map(lambda x: text_processor.process_text(x, lang='ru'))
            df.name = df.name.map(lambda x: ' '.join(x))

            logger.info('Processing of %s succeed, writing it to "%s"', file, self.output()[file].path)

            df.to_csv(self.output()[file].path)
Example no. 9
 def __init__(self, dir_in, dir_out):
     self.dir_in = dir_in
     self.dir_out = dir_out
     self.tw_files = ([file for root, dirs, files in os.walk(self.dir_in)
         for file in files if file.endswith('.json') ])
     self.doc_list = list()
     self.date_list = list()
     self.tp = TextProcessor()
Example no. 10
  def __init__(self, text = ''):
    # prepare to use nltk_data
    path_to_nltk_data = os.path.join(
      os.path.dirname(os.path.abspath(__file__)),
      'nltk_data'
    )
    nltk.data.path.append(path_to_nltk_data)

    # pre-processing
    tp = TextProcessor(text)
    self.text = tp.getProcessedText()

    # setup dictionaries
    corpus = Corpus()
    words = {}
    words['positive'] = corpus.positiveWordDict()
    words['negative'] = corpus.negativeWordDict()
    self.dictionaries = words
Example no. 11
    def get_article(self, url):

        # initialize the UrlHandler
        urlhandler = UrlHandler()
        # fetch the web page and its encoding
        source_page, encoding = urlhandler.load_page(url)

        # initialize the parser, text processor, and article extractor
        html_parser = Parser(source_page, encoding)
        text_processor = TextProcessor(self.language)
        article_extractor = ArticleExtractor(self.language)
        formatter = Formatter()

        # get the lists of elements stripped of tags (raw_cleaned_elements)
        # and with tags preserved (elements_as_string)
        raw_cleaned_elements, elements_as_string = html_parser.get_parsed_nodes(
        )
        # the title
        title = html_parser.get_title()

        # get the list of stemmed texts
        stemmed_tag_elements = text_processor.iterate_over_texts(
            raw_cleaned_elements)
        # get the ranked list of elements
        best_nodes = article_extractor.find_best_node(stemmed_tag_elements)

        # for the first element of the ranked list,
        # find the matching tagged element (elements_as_string)
        # and pass it to the formatter
        for text, element in zip(raw_cleaned_elements, elements_as_string):
            if best_nodes[0][0] == text:
                node_to_format = element

        # the formatter prepares the text for saving
        clean_text = formatter.format_article(node_to_format)

        # save to a text file
        with codecs.open('output.txt', 'w', 'utf-8') as out:
            out.write(title + '\n\n')
            for paragraph in clean_text:
                for line in paragraph:
                    out.write(line)
                    out.write('\n')
                out.write('\n')
Example no. 12
 def process_files(self):
     files=[f for f in listdir(self.path)]
     files_dic={}
     for file in files:
         #process the file based on the file extension
         file_ext=file.split('.')[-1]
         if file_ext=='txt':
             files_dic[file]=self.process_txt(self.path+file)
         elif file_ext=='pdf':
             files_dic[file]=self.process_pdf(self.path+file)
         elif file_ext=='html':
             files_dic[file]=self.process_html(self.path+file)
     tp=TextProcessor()
     for file, text in files_dic.items():
         #call the text_processor module
         text_proc_result=tp.process(JSONEncoder().encode({'action':'process', 'data':text}))
         text_proc_result=JSONDecoder().decode(text_proc_result)['terms']
         files_dic[file]=text_proc_result
     return files_dic
Example no. 13
    def __getitem__(self, idx):

        textprocessor = TextProcessor(VOCAB_SIZE=self.VOCAB_SIZE)
        utils = Utils()

        video_file = self.train_dir_list[
            idx]  # get video file corresponding to the id, idx

        output_text = self.utils.output_text(
            self.train_corpus,
            video_file)  # get the text contained in the video file

        #### generate input 2,  from the output_text
        sentence_to_index = textprocessor.sentence_to_indices(
            utils.tagger_input(utils.clean_text(output_text)),
            self.word_to_index)
        X_2 = textprocessor.get_output(sentence_to_index, self.NUMBER_OF_WORDS)

        #### generate output,  from the output_text
        sentence_to_index = textprocessor.sentence_to_indices(
            utils.tagger_output(utils.clean_text(output_text)),
            self.word_to_index)
        y = textprocessor.get_output(sentence_to_index, self.NUMBER_OF_WORDS)

        video_path = self.train_dir + video_file

        # generate input 1
        X_1 = utils.video_to_frames(video_path, self.number_of_frames,
                                    self.device, self.INPUT_SIZE, self.model,
                                    self.transform)
        #X_1 = pre_data[idx]
        return (X_1, torch.tensor(X_2)), torch.tensor(y)
Example no. 14
def main():
    args = parser.parse_args()

    with open(args.classifier, 'r') as f:
        serialized_classifier = f.read()

    processor = TextProcessor()
    classifier = Classifier.load(serialized_classifier, processor)

    for example in read_dataset(args.data):
        text = example['content']
        predicted_tag = classifier.classify(text)
        print predicted_tag
Example no. 15
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(data_json)

    serialized_classifier = classifier.dump()

    ensure_directory(args.output)
    with open(args.output, 'w') as f:
        f.write(serialized_classifier)
        f.write(os.linesep)
Example no. 16
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    random.shuffle(data_json)

    training_set_ratio = 0.7
    training_set_size = int(training_set_ratio * len(data_json) + 0.5)

    training_set = data_json[:training_set_size]
    test_set = data_json[training_set_size:]

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(training_set)

    print classifier.dump() == Classifier.load(classifier.dump(),
                                               processor).dump()
Example no. 17
def response():
    link = request.form['link']
    if TextProcessor.validate_link(link):
        git_history = GitProcessor.get_history(repo_link=link,
                                               num_of_commits=INITIAL_COMMITS)
        if len(git_history) > 0:
            start_default_value = git_history[0][0]
            end_default_value = git_history[len(git_history) - 1][0]
        else:
            start_default_value = ""
            end_default_value = ""

        return render_template(template_name_or_list="report_generator.html",
                               history=git_history,
                               repo_link=link,
                               start_default_value=start_default_value,
                               end_default_value=end_default_value)
    else:
        return render_template("main.html", text="Invalid Git Repository Link")
Example no. 18
    def ImportLocalizedFiles(self):
        """If new localized files are available, import them.

    This method sifts through the available localized files, and imports each
    to the correct location under CONTENT_ROOT.
    """
        for locale in self.new_localizations:
            if os.path.isfile(self.GetLocalizedFilePath(locale)):
                try:
                    os.mkdir(os.path.dirname(self.GetOriginalFilePath(locale)))
                except OSError:
                    pass
                in_path = self.GetLocalizedFilePath(locale)
                out_path = self.GetOriginalFilePath(locale)
                with codecs.open(in_path, 'r', 'UTF-8') as infile:
                    with codecs.open(out_path, 'w', 'UTF-8') as outfile:
                        temp = TextProcessor(html=infile.read())
                        outfile.write(temp.django)
        self.__available_localizations = []
        self.__locales = []
Example no. 19
class TestIsWord:

    tp = TextProcessor()

    def test_one_word_latin(self):
        assert self.tp.is_word('Test')

    def test_one_word_cyr(self):
        assert self.tp.is_word('Тест')

    def test_two_words(self):
        assert not self.tp.is_word('Test test')

    def test_word_with_num(self):
        assert self.tp.is_word('Test1')

    def test_word_with_underscore(self):
        assert self.tp.is_word('Test_test')

    def test_words_divided_by_symbol(self):
        assert not self.tp.is_word('Test*test')
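
Taken together, the tests above pin down what TextProcessor.is_word accepts: a single token of letters (Latin or Cyrillic), digits and underscores, with no spaces or other symbols. Below is a minimal sketch that satisfies exactly these six cases; it assumes nothing about the real implementation.

import re

def is_word(token):
    # \w covers Unicode letters, digits and '_' in Python 3, so 'Test',
    # 'Тест', 'Test1' and 'Test_test' pass, while 'Test test' and
    # 'Test*test' do not.
    return re.fullmatch(r'\w+', token) is not None
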
Example no. 20
def main():
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)

    # for n in xrange(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:

        corrects = 0
        total = 0

        for _ in xrange(SAMPLES):

            random.shuffle(full_data_json)
            data_json = full_data_json[:n]

            training_set_ratio = 0.7
            training_set_size = int(training_set_ratio * len(data_json) + 0.5)

            training_set = data_json[:training_set_size]
            test_set = data_json[training_set_size:]

            processor = TextProcessor()
            classifier = Classifier(processor)
            classifier.train(training_set)

            for example in test_set:
                text = example['content']
                predicted_tag = classifier.classify(text)
                expected_tag = classifier.normalize_tag_label(example['tag'])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                if predicted_tag == expected_tag:
                    corrects += 1
                else:
                    # print 'expected = {}, predicted = {}'.format(expected_tag, predicted_tag)
                    pass
                total += 1

        print '{} {}'.format(len(data_json), float(corrects) / total)
Example no. 21
    path = dict(cf.items("file_path"))
    dir_w2v = path['dir_w2v']

    print('Loading word2vec model...')
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

    texts, y_true = load_validation_file_csv(VALIDATION_FILE)

    print('Loading ' + MODEL_FILE + ' file...')

    model = joblib.load(MODEL_FILE)
    pol = ''
    n_pol = ''
    y_pred = list()
    tp = TextProcessor()
    texts = tp.text_process(texts, text_only=True)
    X = gen_data(texts)

    mean_auc, std_auc = generate_roc_curve(
        model, X, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

    print('Predicting...')

    y_pred = model.predict(X)

    print('Classification Report')
    print(classification_report(y_true, y_pred))
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

    model_name = MODEL_FILE.replace(SKL_FOLDER, '')
Example no. 22
    file = "/Users/lucasso 1/Documents/validation/nao_politicos.json"

    data = {
        'favorites': [],
        'user_id': [],
        'text': [],
        'retweets': [],
        'created_at': [],
        'tweet_id': [],
        'user_screen_name': []
    }

    for t in load_tweets(file):
        data['user_id'].append(t['user_id'])
        data['favorites'].append(t['favorites'])
        data['text'].append(t['text'])
        data['retweets'].append(t['retweets'])
        data['created_at'].append(t['created_at'])
        data['tweet_id'].append(t['tweet_id'])
        data['user_screen_name'].append(t['user_screen_name'])

    df = pd.DataFrame(data)
    df['created_at'] = pd.to_datetime(df['created_at'], unit='ms')
    df = df.set_index('created_at')
    df = df.sort_index(ascending=True)

    tp = TextProcessor()
    df['text_processed'] = tp.text_process(df.text.tolist(), hashtags=True)
    df['political'] = 0
    file = file.replace('json', 'pck')
    df.to_pickle(file)
Example no. 23
"""

sort_tfidf = sorted(dic_tfidf, key=lambda x: x[1], reverse=True)
sort_tf_log_idf = sorted(dic_tf_log_idf, key=lambda x: x[1], reverse=True)
sort_tfidf_like = sorted(dic_tfidf_like, key=lambda x: x[1], reverse=True)

plot_tfidfs(sort_tfidf, sort_tf_log_idf, sort_tfidf_like)

n = 2000
plot_cloud(sort_tfidf,n,"dic_tfidf")
plot_cloud(sort_tf_log_idf,n,"dic_tf_log_idf")
plot_cloud(sort_tfidf_like,n,"dic_tfidf_like")

dir_path = "/Users/lucasso/Documents/tweets/"
tp = TextProcessor()
tw_files = ([file for root, dirs, files in os.walk(dir_path)
            for file in files if file.endswith('.json') ])

tw_list = list()
tweets = list()
for tw_file in tw_files:
    with open(dir_path+tw_file) as data_file:
        doc_list = list()
        for line in data_file:
            tweet = json.loads(line)
            doc_list.append(tweet['text'])
    tw_list.append(list(itertools.chain.from_iterable(tp.text_process(doc_list))))

for i in range(len(tw_list)):
    plot_dep_cloud(tw_list[i],sort_tfidf,n,tw_files[i]+"_dic_tfidf")
Example no. 24
 def get_text(self, row):
     guid = row['GUID']
     text_processor = TextProcessor(guid, self.file_dictionary)
     return text_processor.get_text()
Example no. 25
from text_processor import TextProcessor
import configparser
import pickle
import numpy as np
import csv
import random

if __name__ == '__main__':
    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_btm = path['dir_btm']
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_down = path['dir_down']
    tp = TextProcessor()

    f = open(dir_down + "sanders-twitter-0.2/full-corpus.csv", "rt")
    twitter = csv.reader(f, delimiter=',')

    tweets = list()
    for tw in twitter:
        tweets.append(tw)

    random.shuffle(tweets)

    topic = list()
    txt = list()
    for tw in tweets:
        topic.append(tw[0])
        txt.append(tw[4])
Example no. 26
def days2time(days):
    #1380844800000  = 04/10/2013, 86400000 = 1 day 
    return 1380844800000+(days*86400000)




      
if __name__=='__main__':

    dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/grafos/"
    name = "74173_DeputadoEduardoCunha"
    filedir = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/coleta_pedro/"+name+".json"
    lamb_dir = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/coleta_pedro/lambdas/lambdats/"+name+"_wsize7.dat"
    tp = TextProcessor()

    with open(filedir) as data_file:
        doc_set = list()
        doc_tw = set()
        dc =set()
        weeks = list()
        dist = list()
        lamb = list()
        inicial = 1
        final = 603
        for line in data_file:
            tweet = json.loads(line)
            created = int(tweet['created_at'])
            if(days2time(inicial) <= created < days2time(final)):
                doc_tw.add(tweet['text'])
Example no. 27
    #model = gensim.models.LdaModel.load('android.lda')
    print(ldamodel.print_topics())
    #ldamodel.print_topics()
    return ldamodel


if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']

    tp = TextProcessor()

    with open(dir_out+"list_parl_tw_bi_trigrams2.pck",'rb') as handle:
        parl_tweets = pickle.load(handle)

    with open(dir_out+"tfidf_like_bi_trigrams.pck",'rb') as handle:
        tfidf_like_bi_trigrams = pickle.load(handle)

    dic_words = dict(sort_tfidf_like[1:15000])

    list_tw_parl = list()
    for parl in parl_tweets:
        temp = list()
        for tw in parl:
            temp.append(list([x for x in tw if x in dic_words]))
        list_tw_parl.append(temp)
Example no. 28
def main(argv=None):
    """
    Training.
    """

    ### parameters

    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_dir = FLAGS.train_dir  #'D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/'
    train_corpus = FLAGS.train_corpus  #'D:/Machine_Learning/datasets/video_corpus/video_corpus.csv'

    print("train_dir is =", train_dir)
    print("train_corpus =", train_corpus)

    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    ### training data preparation
    train_ds = CustomDataset(train_dir,
                             train_corpus,
                             device,
                             dictionary,
                             VOCAB_SIZE,
                             NUMBER_OF_WORDS,
                             INPUT_SIZE,
                             NUMBER_OF_FRAMES,
                             tsfm,
                             model=md.model_vgg)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model_seq_to_seq = Seq2Seq(encoder, decoder).to(device)
    model = model_seq_to_seq

    ### load the state_dict of model if model has been pretrained.
    if (FLAGS.load_weights):
        print("there are weights to be loaded")

        model.load_state_dict(torch.load(FLAGS.load_weights))

    ### optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    #### Model Training

    import time
    print_feq = 1
    best_loss = np.inf
    for epoch in range(1, EPOCH + 1):
        model.train()
        epoch_loss = 0

        for step, (img, label) in enumerate(train_dl):

            time_1 = time.time()  ## timing

            X_1, X_2 = img  ### get inputs

            X_1 = X_1.to(device)  # Set device
            X_2 = X_2.to(device)  # Set device

            label = label.to(device)  # Set output device

            ### zero the parameter gradients
            optimizer.zero_grad()

            ### forward
            prediction = model(X_1, X_2)

            ### Optimize
            prediction = prediction.to(device)
            prediction = torch.squeeze(prediction, 0)
            label = torch.squeeze(label, 0)

            new_label = torch.zeros([label.shape[0]])
            for l in range(label.shape[0]):
                new_label[l] = np.argmax(label[l].cpu())
            new_label = new_label.to(device)
            loss = criterion(prediction, new_label.long())

            # Backward prop.
            loss.backward()
            optimizer.step()

            ### print out statistics
            epoch_loss += loss.item()
            if step % print_feq == 0:
                print('epoch:', epoch, '\tstep:', step + 1, '/',
                      len(train_dl) + 1, '\ttrain loss:',
                      '{:.4f}'.format(loss.item()), '\ttime:', '{:.4f}'.format(
                          (time.time() - time_1) * print_feq), 's')

        ### save best model
        if (epoch_loss < best_loss):
            best_loss = epoch_loss

            model_name = 'MODEL_SEQ2SEQ' + 'VOCAB_SIZE_' + str(
                VOCAB_SIZE) + 'NUMBER_OF_WORDS_' + str(
                    NUMBER_OF_WORDS
                ) + 'HIDDEN_SIZE_' + str(HIDDEN_SIZE) + 'INPUT_SIZE_' + str(
                    INPUT_SIZE) + 'NUMBER_OF_LAYERS_' + str(NUMBER_OF_LAYERS)
            torch.save(model.state_dict(), model_name + '.pth')

        print("The loss for this epoch is = :", epoch_loss / len(train_dl))
Example no. 29
    parser.add_argument('-h5', default=H5_FILE)
    parser.add_argument('-npy', default=NPY_FILE)
    parser.add_argument('-vf', '--validationfile', required=True)

    args = parser.parse_args()
    H5_FILE = args.h5
    NPY_FILE = args.npy
    VALIDATION_FILE = args.validationfile

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']

    X, y_true = load_validation_file_csv(VALIDATION_FILE)
    tp = TextProcessor()

    pc = PoliticalClassification(H5_FILE, NPY_FILE, 25)

    pol = ''
    n_pol = ''
    y_pred = list()
    X = tp.text_process(X, text_only=True)
    for tx in X:
        text = ' '.join(tx)
        if pc.is_political(text):
            pol += text + '\n'
            y_pred.append(1)
        else:
            n_pol += text + '\n'
            y_pred.append(0)
Example no. 30
            
if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("../file_path.properties")
    path = dict(cf.items("file_path"))
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_tw = path['dir_tw']

    print("load tweet files")
    fnames = ([file for root, dirs, files in os.walk(dir_tw)
            for file in files if file.endswith('.json')  ])

    categories_tw = list()
    tp = TextProcessor()
    for fl in fnames:
        categories_tw.append(tp.text_process(read_tweets(fl)))

    categories_counter = list()
    test_data = list()
    for categ in categories_tw:
        k = int(len(categ) * 0.2)
        random.shuffle(categ)
        tmp = list(itertools.chain.from_iterable(categ[k:]))
        categories_counter.append(Counter(tmp))
        test_data.append(categ[:k])


    print("process tfidf")
    tfidf_entropy = list()
Example no. 31
    for tw_file in tw_files:
        with open(dir_in+tw_file) as data_file:
            for line in data_file:
                tweet = json.loads(line)
                doc_list.append(tweet['text'])           
    return doc_list



if __name__=='__main__':

    dir_in= "/Users/lucasso/Documents/pck/"
    dir_ent = "/Users/lucasso/Documents/tweets_pedro/"
    dir_out= "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/"
    doc_list = load_files(dir_ent)
    tp = TextProcessor()
    tweets = tp.text_process(doc_list)
    word_list = set(load_file(dir_out,"word_list.pck"))
    # word list already processed, with entropy 0 and ratio > 1 filtered out; remove all other words of no interest from the tweets
    tweets =[[i for i in t if i in word_list] for t in tweets] 
    hashtags = re.compile(r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""")
    hs_set =set()

    hastgs_list = list()
    for tweet in tweets:
        v = ','.join(hashtags.findall( ' '.join(tweet)))
        l = hashtags.findall( ' '.join(tweet))
        hastgs_list.append(l)
        hs_set |= set(v.split(","))

    hastgs_list = [e for e in hastgs_list if e] # remove the empty lists
Example no. 32
import os
import sys

currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir + '/source')
sys.path.append(parentdir)

import csv
from text_processor import TextProcessor

# read the csv of tweets line by line
csv_file = open('tweet_data/ShardiB2_2021-02-03 05_29_05.236426_tweets.csv',
                "r")

text_pro = TextProcessor()
x = open(
    "tweet_data/no_links or tickers_ShardiB2_2021-02-03 05_29_05.236426_tweets.csv",
    "w")

#loop through the csv list
for row in csv_file:
    replaced_row = text_pro.replace_tickers_with_company_names(row)
    # strip links: truncate the row at the first "https"
    index = replaced_row.find("https")
    if index != -1:
        replaced_row = replaced_row[:index] + ", \n"
    x.writelines(replaced_row)

x.close()
Example no. 33
    def __worker(self, pipe, l_log):
        """The core of the STT program, this is the multiprocessed part

        Note:
            Multiprocessing will require a pipe between the parent and child subprocess.
            Since this is the case, the worker subprocess cannot access non-shared variables

        """

        l_log.debug("STT worker started")

        audio_processor = AudioProcessor(
        )  # Create a new audio processing object
        text_processor = TextProcessor(
        )  # Remember that we can't load the text processor nltk model until the nltk model is set from the client language
        config = Decoder.default_config(
        )  # Create a new pocketsphinx decoder with the default configuration, which is English
        decoder = None
        nltk_model = None
        mutex_flags = {"keyphrases": {"use": False}}
        shutdown_flags = {"shutdown": False, "decoder": None}

        def send_json(pipe, to_send):
            """Internal worker method to send a json through the parent socket

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                to_send (:obj: dict): A dictionary to be sent to the parent socket

            """
            try:
                ret = self.__send_buffered(
                    pipe, to_send
                )  # Send the message passed by argument back to the parent process
                if not ret[0]:
                    l_log.error(
                        "Failed to send buffered message to the parent process! (err: %s)"
                        % ret[1])
            except Exception as err:
                l_log.error("Failed to send json! (err: %s)" % str(err))

        def send_error(pipe, error):
            """Internal worker method to send a json error through the parent socket

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                error (str): The string error message to send

            """
            send_json(pipe, {"error": error})

        def load_models(pipe, config, models):
            """Internal worker method to load the language model

            Note:
                Some languages take a long time to load. English is by far
                the fastest language to be loaded as a model.
            
            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                models (dict): The language and nltk models developed by the parent process
           
            Returns: (Decoder)
                The STT decoder object and the nltk model

            """

            language_model = models["language_model"]
            nltk_model = models["nltk_model"]

            if False in [
                    language_model.is_valid_model(),
                    nltk_model.is_valid_model()
            ]:
                l_log.error("The language model %s is invalid!" %
                            str(language_model.name))
                send_error(pipe, "Failed loading language model!")
                return

            # Load the model configurations into pocketsphinx
            config.set_string('-hmm', str(language_model.hmm))
            config.set_string('-lm', str(language_model.lm))
            config.set_string('-dict', str(language_model.dict))
            decoder = Decoder(config)

            send_json(
                pipe,
                {"success": True})  # Send a success message to the client

            l_log.debug("Set the language model to %s" %
                        str(language_model.name))

            return decoder, nltk_model  # Return the new decoder and nltk model

        def process_text(pipe, text, is_final, args):
            """Internal worker method to process the Speech To Text phrase

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                text (str): The spoken text to further process
                is_final (bool): True if the text being processed is the final text, otherwise it is a partial result
                args (dict): Any other flags specifically required for a final or partial speech result
            """

            generate_keyphrases = mutex_flags["keyphrases"]["use"]
            keyphrases = []

            if generate_keyphrases:
                text_processor.generate_keyphrases(
                    text)  # Generate keyphrases from the given text
                keyphrases_list = text_processor.get_keyphrases()

                for keyphrase in keyphrases_list:
                    to_append_keyphrase = {
                        "score": keyphrase[0],
                        "keyphrase": keyphrase[1]
                    }
                    keyphrases.append(to_append_keyphrase)
            else:
                keyphrases = text  # Don't do any processing and just pass the text into the keyphrases

            # Generate the json to be sent back to the client
            hypothesis_results = args
            hypothesis_results["keyphrases"] = generate_keyphrases
            if is_final:
                hypothesis_results["hypothesis"] = keyphrases
            else:
                hypothesis_results["partial_hypothesis"] = keyphrases

            print(hypothesis_results)

            # Send the results back to the client
            send_json(pipe, hypothesis_results)

        def start_audio(pipe, decoder, args):
            """Internal worker method to start the audio processing chunk sequence

            Note:
                This must be called before the process_audio method or the STT engine will not process the audio chunks

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                decoder (Decoder): The pocketsphinx decoder to control the STT engine
                args (dict): All of the available arguments passed by the parent process

            """

            if decoder is None:
                l_log.error("Language model is not loaded")
                send_error(pipe, "Language model not loaded!")
                send_json(pipe, {"decoder": False})
                return

            l_log.debug("Starting the audio processing...")

            decoder.start_utt()  # Start the pocketsphinx listener

            # Tell the client that the decoder has successfully been loaded
            send_json(pipe, {"decoder": True})

        def process_audio(pipe, decoder, args):
            """Internal worker method to process an audio chunk

            Note:
                The audio chunk is expected to be in base64 format

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                decoder (Decoder): The pocketsphinx decoder to control the STT engine
                args (dict): All of the available arguments passed by the parent process

            """
            if decoder is None:
                l_log.error("Language model is not loaded")
                send_error(pipe, "Language model not loaded!")
                return

            l_log.debug("Processing audio chunk!")

            audio_chunk = args["audio"]  # Retrieve the audio data
            processed_wav = audio_processor.process_chunk(
                audio_chunk)  # Process the base64 wrapped audio data

            l_log.debug("Recognizing speech...")

            decoder.process_raw(
                processed_wav, False,
                False)  # Process the audio chunk through the STT engine

            hypothesis = decoder.hyp()  # Get pocketsphinx's hypothesis

            # Send back the results of the decoding
            if hypothesis is None:
                l_log.debug("Silence detected")
                send_json(pipe, {
                    "partial_silence": True,
                    "partial_hypothesis": None
                })
            else:
                hypothesis_results = {
                    "partial_silence":
                    False if len(hypothesis.hypstr) > 0 else True,
                }

                l_log.debug("Partial speech detected: %s" %
                            str(hypothesis.hypstr))
                process_text(pipe, hypothesis.hypstr, False,
                             hypothesis_results)

            l_log.debug("Done decoding speech from audio chunk!")

        def stop_audio(pipe, decoder, args):
            """Internal worker method to stop the audio processing chunk sequence

            Note:
                This must be called after the process_audio method or the STT engine will continue to listen for audio chunks

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                decoder (Decoder): The pocketsphinx decoder to control the STT engine
                args (dict): All of the available arguments passed by the parent process

            """

            if decoder is None:
                l_log.error("Language model is not loaded")
                send_error(pipe, "Language model not loaded!")
                send_json(pipe, {"decoder": False})
                return

            l_log.debug("Stopping the audio processing...")

            decoder.end_utt()  # Stop the pocketsphinx listener

            l_log.debug("Done recognizing speech!")

            hypothesis = decoder.hyp()  # Get pocketsphinx's hypothesis
            logmath = decoder.get_logmath()

            # Send back the results of the decoding
            if hypothesis is None:
                l_log.debug("Silence detected")
                send_json(pipe, {"silence": True, "hypothesis": None})
            else:
                hypothesis_results = {
                    "silence": False if len(hypothesis.hypstr) > 0 else True,
                    "score": hypothesis.best_score,
                    "confidence": logmath.exp(hypothesis.prob)
                }

                l_log.debug("Speech detected: %s" % str(hypothesis.hypstr))
                process_text(pipe, hypothesis.hypstr, True, hypothesis_results)

        def shutdown_thread(self, l_log):
            """Worker method to handle the checking of a shutdown call

            Note:
                To reduce overhead, this thread will only be called every 100 milliseconds

            """
            while not shutdown_flags["shutdown"]:
                try:
                    if self._shutdown_event.is_set():
                        l_log.debug("Shutting down worker thread!")
                        shutdown_flags["shutdown"] = True  # Exit the main loop
                        if shutdown_flags["decoder"] is not None:
                            try:
                                shutdown_flags["decoder"].end_utt()
                            except Exception as err:
                                l_log.debug(
                                    "STT decoder object returned a non-zero status"
                                )
                        else:
                            l_log.warning(
                                "The decoder object is already None!")

                        break
                    sleep(0.1)
                except Exception as err:
                    l_log.error(
                        "Failed shutting down worker thread! (err: %s)" %
                        str(err))

        shutdown_t = Thread(target=shutdown_thread, args=(
            self,
            l_log,
        ))
        shutdown_t.setDaemon(True)
        shutdown_t.start()

        p_out, p_in = pipe
        while not shutdown_flags["shutdown"]:
            try:
                try:
                    command = self.__get_buffered(
                        p_out)  # Wait for a command from the parent process
                    if "set_models" in command[
                            "exec"]:  # Check to see if our command is to
                        decoder, nltk_model = load_models(
                            p_out, config, command["args"])
                        text_processor.set_nltk_model(
                            nltk_model)  # Set the text processor nltk model
                        shutdown_flags["decoder"] = decoder
                    elif "start_audio" in command["exec"]:
                        start_audio(p_out, decoder, command["args"])
                    elif "process_audio" in command["exec"]:
                        process_audio(p_out, decoder, command["args"])
                    elif "stop_audio" in command["exec"]:
                        stop_audio(p_out, decoder, command["args"])
                    elif "set_keyphrases" in command["exec"]:
                        mutex_flags["keyphrases"] = command["args"]
                    else:
                        l_log.error("Invalid command %s" % str(command))
                        send_error(p_out, "Invalid command!")
                except (EOFError, IOError) as err:
                    continue
            except Exception as err:
                l_log.error(
                    "Failed recieving command from subprocess (id: %d) (err: %s)"
                    % (current_process().pid, str(err)))
Example no. 34
    np.random.seed(SEED)
    print('W2VEC embedding: %s' % (W2VEC_MODEL_FILE))
    print('Embedding Dimension: %d' % (EMBEDDING_DIM))
    print('Allowing embedding learning: %s' % (str(LEARN_EMBEDDINGS)))

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_w2v = path['dir_w2v']
    dir_in = path['dir_in']

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

    tp = TextProcessor()

    texts = list()
    tx_class = list()

    tmp = list()
    with open(POLITICS_FILE) as l_file:
        for line in l_file:
            tmp.append(line)
            tx_class.append('politics')

    texts += tp.text_process(tmp, text_only=True)

    tmp = list()
    with open(NON_POLITICS_FILE) as l_file:
        for line in l_file:
Example no. 35
        parl_tw_list.append(temp)
    return doc_list, parl_tw_list 


if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_rob = path['dir_rob']

    doc_list, parl_tw_list = load_files(dir_rob)
    tp = TextProcessor()

    parl_tw_processed = list()
    for l in parl_tw_list:
        parl_tw_processed.append(tp.text_process(l, text_only=True))


    with open(dir_in+"coleta1.pck",'rb') as handle:
        coleta1 = pickle.load(handle)

    with open(dir_in+"coleta2.pck",'rb') as handle:
        coleta2 = pickle.load(handle)

    tweets = list(itertools.chain.from_iterable(list(itertools.chain.from_iterable(parl_tw_processed))))
    tot_counter = Counter(tweets)
Example no. 36
if __name__ == "__main__":
    pp = pprint.PrettyPrinter(indent=4, depth=2)

    # Initialize classifier
    classifier = NaiveBayes()

    # Train
    for f in find("data/1/training"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue

        with open(f) as doc:
            text = doc.read()

        sentences = nlp.process_text(text)

        label = "movie" if "movie" in f else "play"

        classifier.train(sentences, label=label)

    # Test
    for f in find("data/1/testing"):
        f = f.strip()
        if not f.endswith(".txt"):
            continue

        with open(f) as doc:
            text = doc.read()

        sentences = nlp.process_text(text)
Example no. 37
class ArticleExtractor(object):
    """
    A class that counts signal-word frequencies in page elements
    and selects the element that contains the news article.
    """

    def __init__(self, language):

        self.settings = Settings()

        self.language = language
        self.text_processor = TextProcessor(self.language)
        self.signalwords = self.settings.signal_words_ru

    def count_freqs(self, stemmed_text):
        """
        Computes the relative frequency of each signal word in the text.
        Returns a list of tuples whose first element is the word
        and whose second element is its relative frequency.
        """

        stemfreqs = defaultdict(int)

        for stem in stemmed_text:
            stemfreqs[stem] += 1

        total_stems_in_text = float(len(stemmed_text))

        stems = [(word, freq / total_stems_in_text) for word, freq in stemfreqs.iteritems()]

        sorted_freqs = sorted(((term, round(frequency, 3)) for term, frequency in stems),
                              key=lambda w: w[1], reverse=True)

        return sorted_freqs

    def count_signalwords_in_html(self, texts):

        signalwords_to_text = {}

        for raw_element, stemmed_text in texts.iteritems():
            signalwords_count = self.count_freqs(stemmed_text)

            signalwords_to_text[raw_element] = Counter({term[0]: term[1] for term in signalwords_count})

        return signalwords_to_text

    def count_signalwords_in_file(self):

        signalwords = self.signalwords

        signalwords_terms = [term for term in self.text_processor.process_line(signalwords)]

        signalwords_freqs = self.count_freqs(signalwords_terms)

        return Counter({term[0]: term[1] for term in signalwords_freqs})

    @staticmethod
    def cosine_similarity(html_freqdict, signalwords_freqdict):
        """
        Computes the cosine similarity:
        https://en.wikipedia.org/wiki/Cosine_similarity
        """

        terms = set(html_freqdict.keys()).union(set(signalwords_freqdict.keys()))

        doc_vector = [html_freqdict[k] for k in terms]
        signals_vector = [signalwords_freqdict[k] for k in terms]

        doc_vector = numpy.asanyarray(doc_vector, dtype=float)
        signals_vector = numpy.asanyarray(signals_vector, dtype=float)

        dot_product = 0.0
        for v1, v2 in zip(doc_vector, signals_vector):
            dot_product += v1*v2

        magnitude_v1 = math.sqrt(sum(i1**2 for i1 in doc_vector))
        magnitude_v2 = math.sqrt(sum(i2**2 for i2 in signals_vector))

        if magnitude_v2 != 0 and magnitude_v1 != 0:

            return dot_product / (magnitude_v1 * magnitude_v2)
        else:
            return 0.0

    def find_best_node(self, texts_stemmed):
        
        """
        Computes the cosine similarity for each element and sorts the
        elements by decreasing similarity. Returns a ranked list of
        tuples whose first element is the text and whose second is its
        cosine similarity.
        """

        result_dict = {}

        docs_index = self.count_signalwords_in_html(texts_stemmed)
        signals_freqdict = self.count_signalwords_in_file()

        for tag_element_text, element_terms_dict in docs_index.iteritems():

            cossim = self.cosine_similarity(element_terms_dict, signals_freqdict)
            result_dict[tag_element_text] = cossim

        # mean = sum(result_dict.values()) / len(result_dict)

        relevant_elements = sorted(((text, cos) for text, cos in result_dict.iteritems()),
                                   key=lambda w: w[1], reverse=True)

        return relevant_elements
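
A small worked illustration of the cosine_similarity helper above, using two hand-made Counter frequency dictionaries (the numbers are invented for the example): strongly overlapping distributions score high, and an empty one scores 0.

from collections import Counter

html_freqs = Counter({'report': 0.4, 'minister': 0.3, 'statement': 0.3})
signal_freqs = Counter({'report': 0.5, 'statement': 0.5})

# Counters return 0 for missing terms, which is what cosine_similarity relies on.
print(ArticleExtractor.cosine_similarity(html_freqs, signal_freqs))  # ~0.85
print(ArticleExtractor.cosine_similarity(html_freqs, Counter()))     # 0.0
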
Example no. 38
import json
import pymongo
import configparser
import os
from text_processor import TextProcessor
import xlrd
cf = configparser.ConfigParser()
cf.read("file_path.properties")
path = dict(cf.items("file_path"))
dir_in = path['dir_in']
dir_xls = path['dir_xls']

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client.twitterdb
tp = TextProcessor()
tw_files = sorted([file for root, dirs, files in os.walk(dir_in)
        for file in files if file.endswith('.json')])
excel = True

if excel:

    sheet_name = "nao_eleitos"
    col = 4
    rep_dic = {}
    for fname in tw_files:
        rep_dic[fname.split('_',1)[0]] = fname
    xls = xlrd.open_workbook(dir_xls)
    sheet = xls.sheet_by_name(sheet_name)
    for i in range(sheet.nrows):
        id_rep = str(int(sheet.cell_value(rowx= i, colx=col)))
        if (id_rep in rep_dic):
Example no. 39
 def setText(self, text):
   tp = TextProcessor(text)
   self.text = tp.getProcessedText()
Example no. 40
    for tw in dep:
        tw_dic[str(month_tw(tw[0]))+"_"+str(i)]=tw[1]
        save_file(dir_out,tw[1],i,month_tw(tw[0]))

random_pck =list()
with open(dir_out+"random-pck/coleta1.pck",'rb') as handle:
    random_pck.append(pickle.load(handle))

with open(dir_out+"random-pck/coleta2.pck",'rb') as handle:
    random_pck.append(pickle.load(handle))

month_files = list()
for m in range(10):
    month_files.append(load_files(dir_out+"tw_month/month_"+str(m)+"/"))

tp = TextProcessor()
month_processed =list()
for tw in month_files:
    tmp = list()
    for dep in tw:
        tmp.append(tp.text_process(dep,text_only=True))
    month_processed.append(tmp)

ranked_month = list()
for i,month in enumerate(month_processed):
    tmp = tfidf_month(month,random_pck)
    ranked_month.append(tmp)
    save_pck(dir_out+"tw_month/month_"+str(i)+"/",tmp)

tfidf = TfIdf()
Example no. 41
def process_words():
    textProcessor = TextProcessor()
    processed_text = textProcessor.process(request.args.get('text'))
    return jsonify(response=processed_text)
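
A sketch of how this view might be exposed, assuming a Flask app and a hypothetical /process_words route; neither the app object nor the route registration appears in the snippet above.

from flask import Flask, request, jsonify
from text_processor import TextProcessor

app = Flask(__name__)

@app.route('/process_words')  # hypothetical route name
def process_words():
    text_processor = TextProcessor()
    processed_text = text_processor.process(request.args.get('text', ''))
    return jsonify(response=processed_text)

# e.g. GET /process_words?text=some+raw+text  ->  {"response": <processed text>}
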
Example no. 42
    #1380844800000  = 04/10/2013, 86400000 = 1 day 
    return 1380844800000+(days*86400000)

def days2timeInterval(day1, day2):
    #1380844800000  = 04/10/2013, 86400000 = 1 day 
    return (1380844800000+(day1*86400000)), (1380844800000+(day2*86400000))

if __name__=='__main__':

    dir_in = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/coleta_pedro/"
    dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/"
    excel_path = "/Users/lucasso/Dropbox/Twitter_Marcelo/Arquivo Principal da Pesquisa - Quatro Etapas.xls"
    sheet_name = "amostra"
    col = 4
    rt = ReadTwitter(dir_in, excel_path, sheet_name, col )
    tp = TextProcessor()
   

    id_rep, names = rt.names_from_xls()
    
    for idx in range(len(names)):

        tweets = list()
        graphs = list()
        tw = nx.Graph()
        data = rt.tweets_election_data(id_rep[idx])
        
        diction = {k:v for (k,v) in data.items()}

        for i in diction:
            tweets.append(list(itertools.chain.from_iterable(tp.text_process(diction[i].split()))))
Example no. 43
def process_file(file_id):
    file = File.objects.get(pk=file_id)
    print(file)
    try:
        origin_path = file.origin_file.path
    except ValueError:
        origin_path = None
    file.input_type = get_input_type(file)
    print(file.input_type)
    file.progress += 10
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()


    document = None
    if file.input_type == File.InputTypes.IMAGE:
        document = Document()
        text = image_to_text(origin_path)
    elif file.input_type == File.InputTypes.TEXTBOX:
        text = file.origin_text
    else:
        document = Document(origin_path, text_params)
        text = document.parse()
    # file.progress += 50
    file.progress += 20
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()

    text_processor = TextProcessor()
    processed_text = text_processor.process(text)
    # file.progress += 30
    file.progress += 20
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()

    if file.input_type == File.InputTypes.TEXTBOX:
        file.processed_text = processed_text

    else:
        if document is None:
            raise ValueError('Error with document')
        output_name = get_output_field(file)
        document.change_text(processed_text)
        document.save(file.processed_file.storage.path(output_name))
        file.processed_file = output_name
    sleep(.5)
    file.progress = 100
    file.save()
    print(file)
Example no. 44
def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set
    # TODO: fix this splitting sometimes failing when corpus size changes
    train_set, test_set = torch.utils.data.random_split(
        dataset, [
            int(len(dataset) * DATA_SPLIT),
            int(len(dataset) * (1.0 - DATA_SPLIT))
        ])

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)

    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional filter
    N_FILTERS = 64

    # sizes and number of convolutional layers
    FILTER_SIZES = [2, 3]

    # dropout for between conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12

    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # learning stuff...
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

            acc = correct / (correct + wrong)
            if acc > best_acc:
                best_acc = acc
                # remove older checkpoints so only the best-performing epoch is kept
                for file in glob.glob('models/state_*.pth'):
                    os.remove(file)
                torch.save(model.state_dict(), f'models/state_{epoch}.pth')

            print()
            print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
            print('[[TN, FP], [FN, TP]]')
            print(m)
            print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
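The least obvious part of the training script above is the class weighting for the sampler: count the samples per class, normalize, reverse, and use the result as per-sample weights. A standalone sketch of the same balancing idea, written with reciprocal frequencies and made-up labels:

# Minimal sketch of inverse-frequency weighting for a WeightedRandomSampler.
# The toy labels below are made up; in the script above they come from the dataset.
from collections import Counter

import torch

labels = [0, 0, 0, 1]                                  # imbalanced toy labels
freq = Counter(labels)                                 # {0: 3, 1: 1}
class_weight = {c: 1.0 / n for c, n in freq.items()}   # rarer class -> larger weight
weights = [class_weight[y] for y in labels]

sampler = torch.utils.data.WeightedRandomSampler(
    weights=weights, num_samples=len(labels), replacement=True)
# each draw now picks class 0 and class 1 with roughly equal probability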
Esempio n. 45
0
import nltk
from nltk.stem.lancaster import LancasterStemmer  
from nltk.corpus import stopwords, brown
from text_processor import TextProcessor
from keyword_retrieval import KeywordRetrieval
from datetime import datetime

if __name__ == "__main__":
    text = raw_input("Please input your text (preferably no more than 15 words)\n>> ")
    # ontology-based case
    print "Case: [[Ontology-based method]]\nText: %s" % text
    # create text processor to get keywords
    start = datetime.now()
    text_processor = TextProcessor(text = text)
    keywords = text_processor.get_keywords()
    dos = {}
    if keywords:
        print "Extracted ontology keywords:", keywords
        for word in keywords:
            kr = KeywordRetrieval(keyword=word)
            dos[word] = kr.get_result()
    else:
        keywords = []
        print "The model did not extract any keywords"
    end = datetime.now()
    time_1 = end - start
    for word in keywords:
        if dos[word]:
            # sort by name length (used as a similarity proxy)
            for do in sorted(dos[word], key=lambda do: len(do.name)):
                # print each result's information
                print "_" * 100
Esempio n. 46
0
 def form_valid(self, form):
     text_processor = TextProcessor()
     form.instance.processed_text = text_processor.process(
         form.cleaned_data['origin_text'])
     return super().form_valid(form)
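A form_valid override like the one above normally lives on a Django generic editing view. A minimal sketch of the surrounding view, where the model, its fields, the success URL and the import path of TextProcessor are all assumptions:

# Hypothetical CreateView around the form_valid override above; model, field and
# URL names are assumptions, not taken from the original project.
from django.views.generic.edit import CreateView

from .models import TextEntry                  # assumed model with origin_text / processed_text
from .text_processor import TextProcessor      # assumed import path


class TextEntryCreateView(CreateView):
    model = TextEntry
    fields = ['origin_text']
    success_url = '/entries/'

    def form_valid(self, form):
        # fill the processed text before the parent class saves the instance
        form.instance.processed_text = TextProcessor().process(
            form.cleaned_data['origin_text'])
        return super().form_valid(form)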
Esempio n. 47
0
    return w1 == separator_word or w2 == separator_word or w3 == separator_word

if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']

    doc_list, parl_tw_list = load_files(dir_in)
    _, list_aleatory = load_files(dir_ale)

    tp = TextProcessor()
    tweets = tp.text_process(doc_list,text_only=True)
    tw_words = add_separator(tweets)
    parl_bigrams = get_bigrams(tw_words,3,True)

    # process each deputy's tweets
    parl_processed = list()
    parl_tri_processed = list()
    for l in parl_tw_list:
        temp = add_separator(tp.text_process(l,text_only=True))
        parl_tri_processed.append(get_trigrams(temp, 3, True))
        parl_processed.append(get_bigrams(temp,3,True))

    with open(dir_out+"list_dept_bigrams_.pck", 'wb') as handle:
        pickle.dump(parl_processed, handle)
    with open(dir_out+"list_dept_trigrams_.pck", 'wb') as handle:
Esempio n. 48
0
def main(argv=None):
    """
    Training.
    """

    ### parameters

    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_corpus = FLAGS.train_corpus
    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model_seq_to_seq = Seq2Seq(encoder, decoder).to(device)
    model = model_seq_to_seq

    ### load the model's state_dict (the model is assumed to be pretrained)
    model.load_state_dict(torch.load(FLAGS.load_weights))

    ### Model Testing
    model.eval()
    from random import randint
    import matplotlib.pyplot as plt

    utils = Utils()

    video_path = FLAGS.video_file

    video_pre_data = utils.video_to_frames(video_path,
                                           frame_number=NUMBER_OF_FRAMES,
                                           device='cuda',
                                           INPUT_SIZE=INPUT_SIZE,
                                           model=md.model_vgg,
                                           transform=tsfm)

    X_2 = torch.zeros([NUMBER_OF_WORDS, VOCAB_SIZE])

    for i in range(NUMBER_OF_WORDS):
        if i == 0:
            X_2[i][2] = 1
        else:
            X_2[i][1] = 1

    input_data = video_pre_data.unsqueeze(0)

    final_sentence = []

    X_2 = X_2.unsqueeze(0)
    X_2 = X_2.to(device)
    input_data = input_data.to(device)

    for i in range(NUMBER_OF_WORDS - 1):
        with torch.no_grad():
            predicted = model(input_data, X_2)
            predicted = predicted.squeeze(0)

            final_sentence.append(
                next((key for key, value in dictionary.items()
                      if value == torch.argmax(predicted[i])), None))
            X_2[0][i + 1][torch.argmax(predicted[i])] = 1
            X_2[0][i + 1][1] = 0
    print(final_sentence)
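The greedy decoding loop above scans the whole dictionary on every step to map the predicted index back to a word. Inverting the vocabulary once gives an equivalent, clearer variant; this sketch reuses the names from the snippet and assumes, as the loop above does, that dictionary maps word to index:

# Sketch of the same greedy decoding with a precomputed index-to-word mapping.
index_to_word = {index: word for word, index in dictionary.items()}

final_sentence = []
for i in range(NUMBER_OF_WORDS - 1):
    with torch.no_grad():
        predicted = model(input_data, X_2).squeeze(0)
    token_id = torch.argmax(predicted[i]).item()
    final_sentence.append(index_to_word.get(token_id))
    X_2[0][i + 1][token_id] = 1
    X_2[0][i + 1][1] = 0
print(final_sentence)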
Esempio n. 49
0
    #1380844800000  = 04/10/2013, 86400000 = 1 day 
    return 1380844800000+(days*86400000)

def days2timeInterval(day1, day2):
    #1380844800000  = 04/10/2013, 86400000 = 1 day 
    return (1380844800000+(day1*86400000)), (1380844800000+(day2*86400000))

if __name__=='__main__':

    dir_in = "/Users/lucasso/Documents/tweets_pedro/"
    dir_out = "/Users/lucasso/Dropbox/Twitter_Marcelo/Report/plot/"
    excel_path = "/Users/lucasso/Dropbox/Twitter_Marcelo/Arquivo Principal da Pesquisa - Quatro Etapas.xls"
    sheet_name = "amostra"
    col = 4
    rt = ReadTwitter(dir_in, excel_path, sheet_name, col )
    tp = TextProcessor()
   

    id_rep, names = rt.names_from_xls()
    parl_words = Counter()
    counter_list = list()
    tw_apriori = list()

    for idx in range(len(names)):
        tweets = list()
        data = rt.tweets_election_data(id_rep[idx])

        for k,v in data.items():
            tweets.append(tp.text_process(v.split()))

    tw_apriori += [[x[0] for x in e if x] for e in tweets if e]
Esempio n. 50
0
        with open(dir_in+tw_file) as data_file:
            for line in data_file:
                tweet = json.loads(line)
                temp.append(tweet['text'])
        doc_list.append(temp)
    return doc_list, tw_files


if __name__ == "__main__":
    
    cf = configparser.ConfigParser()
    cf.read("../file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_val']

    tp = TextProcessor()
    tweets = list()
    doc_list, tw_files = load_files(dir_in)
    for txt in doc_list:
        print(len(doc_list))
        tweets.append(tp.text_process(txt, text_only=True))

    for i, fl in enumerate(tw_files):
        with open(dir_in + "%s.txt" % fl.split('.')[0], 'w') as f:
            for tw in tweets[i]:
                f.write(" ".join(tw) + "\n")


Esempio n. 51
0
 def setUp(self):
     self.tp = TextProcessor()
Esempio n. 52
0
embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

model = TextCNN(
    embeddings=embeddings,
    n_filters=64,
    filter_sizes=[2, 3],
    dropout=0.0,
)

device = torch.device('cpu')
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

text_processing = TextProcessor(
    wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
    tokenizer=get_tokenizer('basic_english'),
    standardize=True,
    min_len=3,
)


@app.post('/game')
async def game(request: Request):
    q = request.form.get('q', None)

    if q is None:
        return HTTPResponse(status=400)

    tokens = text_processing.process(q)
    # process() returns token indices rather than a tensor (see the training
    # script usage), so convert before adding the batch dimension
    x = torch.tensor(tokens).unsqueeze(dim=0)

    pred = model(x)
Esempio n. 53
0
class TextProcessorTest(unittest.TestCase):

    tp = None

    def setUp(self):
        self.tp = TextProcessor()

    def validate_phrases(self, phrases, action, args=[], urls=[]):
        has_args = len(args) > 0
        has_urls = len(urls) > 0
        curr_phrase = 0
        curr_url = "google.com"
        for phrase in phrases:
            if has_urls:
                curr_url = urls[curr_phrase]
            web_action = self.tp.process_web_action_request(phrase, curr_url)
            self.assertNotEqual(web_action, None)
            self.assertTrue(web_action[h.CMD] in self.tp.action_text_mappings.keys())
            self.assertEqual(action, web_action[h.CMD])
            print phrase.strip()
            print web_action
            num_args = 0
            if has_args:
                for arg in args[curr_phrase]:
                    if arg in web_action['arguments'].values():
                        num_args += 1
                self.assertEqual(len(args[curr_phrase]), num_args)
            nonempty_args = [x for x in web_action['arguments'].values() if x]
            self.assertEqual(num_args, len(nonempty_args))
            curr_phrase += 1

    def test_scroll_left(self):
        template_phrases = ["scroll left", "left scroll"]
        self.validate_phrases(template_phrases, h.SCROLL_LEFT)

    def test_scroll_right(self):
        template_phrases = ["scroll right", "right scroll"]
        self.validate_phrases(template_phrases, h.SCROLL_RIGHT)

    def test_scroll_up(self):
        template_phrases = ["scroll up", "up scroll", "scroll up one page", "scroll up ten pages"]
        args = [[1], [1], [1], [10]]
        self.validate_phrases(template_phrases, h.SCROLL_UP, args)

    def test_scroll_down(self):
        template_phrases = ["scroll down", "down scroll", "scroll down one page", "scroll down four pages"]
        args = [[1], [1], [1], [4]]
        self.validate_phrases(template_phrases, h.SCROLL_DOWN, args)

    def test_zoom_in(self):
        template_phrases = ["zoom in by ten percent", "zoom in 20 percent", "zoom", "zoom larger",
                            "zoom in"]
        args = [[10], [20], [25], [25], [25]]
        self.validate_phrases(template_phrases, h.ZOOM_IN, args)

    def test_zoom_out(self):
        template_phrases = ["zoom out by fifty percent", "zoom out by thirty five percent", "zoom away 30 percent",
                            "zoom out 100 percent", "zoom smaller", "zoom out"]
        args = [[50], [35], [30], [100], [25], [25]]
        self.validate_phrases(template_phrases, h.ZOOM_OUT, args)

    def test_open_new_tab(self):
        template_phrases = ["open tab Spotify", "open tab cnn", "open a tab facebook.com",  "new tab google.com",
                            "open a tab", "open a new tab", "new tab", "open new tab", "create tab", "create a new tab",
                            "create new tab"]
        args = [["spotify"], ["cnn"], ['facebook.com'], ['google.com']] + [[] for i in range(len(template_phrases))]
        self.validate_phrases(template_phrases, h.OPEN_TAB, args)

    def test_close_tab(self):
        template_phrases = ["close tab three", "close tab facebook", "exit tab 2", "exit tab StackOverflow", "leave tab twelve", "leave tab google"]
        args = [[3], ["facebook"], [2], ["stackoverflow"], [12], ["google"]]
        self.validate_phrases(template_phrases, h.CLOSE_TAB, args)

    def test_switch_tab(self):
        template_phrases = ["switch to facebook", "switch to four", "switch to tab three", "switch to tab google.com",
                            "switch to Facebook", "change to CNN", "open the twelfth tab", "switch to the first tab", "change to Facebook tab",
                            "change to tab four", "change to Pandora", "change tab to tab 4",
                            "change tab to the weather"]

        args = [["facebook"], [4], [3], ["google.com"], ["facebook"], ["cnn"], [12], [1], ["facebook"], [4], ["pandora"], [4], ["the weather"]]
        self.validate_phrases(template_phrases, h.SWITCH_TAB, args)

    def test_forward_page(self):
        # ["forward", "go forward", "go forward a page",
        template_phrases = ["go to the next page", "next page",
                            "ahead a page", "forward a page", "one page forward", "page forward", "page ahead"]
        self.validate_phrases(template_phrases, h.FORWARD)

    def test_backward_page(self):
        template_phrases = ["back", "backward", "go backward", "go backward a page", "go back a page",
                            "go to the previous page", "previous page", "back a page", "backward a page",
                            "one page backward", "page backward", "page back"]
        self.validate_phrases(template_phrases, h.BACKWARD)

    def test_refresh_page(self):
        template_phrases = ["refresh the page", "refresh page", "page refresh", "refresh this page"]
        self.validate_phrases(template_phrases, h.REFRESH)

    def test_click_element(self):
        template_phrases = ["open link facebook", "click google doc link", "click link github.com",
                            "open link w w w dot google dot com", "click github dot com", "click search",
                            "click the home button", "click submit", "click more", "click sent mail", "click the submit button",
                            "click the post button", "click the home button"]

        args = [['facebook'], ['google doc'], ['github.com'], ['www.google.com'], ['github.com'], ['search'], ['home'],
                ['submit'], ['more'], ['sent mail'], ['submit'], ['post'], ['home']]
        self.validate_phrases(template_phrases, h.CLICK, args)

    def test_open_url(self):
        template_phrases = ["open www.google.com in the current tab", "open facebook.com", "open new google.com",
                            "open accuweather.com in this tab", "open pandora.com in this tab",
                            "open youtube.com in this tab", "open spotify.com in this tab"]
        args = [['www.google.com', 'true'], ['facebook.com', 'false'], ['google.com', 'false'], ['accuweather.com', 'true'], ['pandora.com', 'true'], ['youtube.com', 'true'],
                ['spotify.com', 'true']]
        self.validate_phrases(template_phrases, h.OPEN_URL, args)

    def test_select_element(self):
        template_phrases = ["select search box", "select what are you interested in?", "select username",
                            "select search", "select password", "select search facebook", "select what's on your mind?",
                            "select write a comment..."]
        args = [['search box'], ['what are you interested in?'], ['username'], ['search'], ['password'],
                ['search facebook'], ['what\'s on your mind?'], ["write a comment..."]]
        self.validate_phrases(template_phrases, h.SELECT_ELEMENT, args)

    def test_enter_text(self):
        template_phrases = ["enter senior project has been a long process of testing",
                            "enter text I feel great today for some reason",
                            "write I feel great today and want to go on vacation",
                            "enter text the wheels on the car are worth $2500"]
        args = [['senior project has been a long process of testing'],
                ['i feel great today for some reason'], ['i feel great today and want to go on vacation'],
                ['the wheels on the car are worth $2500']]
        self.validate_phrases(template_phrases, h.ENTER_TEXT, args)

    def test_submit_text(self):
        template_phrases = ["submit text", "submit"]
        args = [[], []]
        self.validate_phrases(template_phrases, h.SUBMIT_TEXT, args)

    def test_open_help(self):
        template_phrases = ["help please", "please help", "open help", "open browsing assistance", "browsing assistance", "assistance", "assistant", "helper", "help window", "help me", "show hints", "open hints", "display hints", "list functions", "list commands", "list actions", "show actions", "show commands"]
        self.validate_phrases(template_phrases, h.OPEN_HELP)

    def test_close_help(self):
        template_phrases = ["close help", "close help page", "hide commands", "hide help", "hide hints", "hide functions", "close browsing assistance"]
        self.validate_phrases(template_phrases, h.CLOSE_HELP)

    # start video context
    def test_play_video(self):
        template_phrases = ["play", "play video", "play movie", "start", "start video", "start movie"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.PLAY_VIDEO, urls=urls)

    def test_pause_video(self):
        template_phrases = ["stop", "stop video", "stop movie", "stop youtube", "paws", "pause", "paws movie",
                            "paws video", "paws youtube", "pause youtube", "pause video", "pause movie"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.PAUSE_VIDEO, urls=urls)

    def test_next_video(self):
        template_phrases = ["next", "next video", "next movie", "next video in playlist", "next movie in playlist"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.NEXT_VIDEO, urls=urls)

    def test_open_fullscreen(self):
        template_phrases = ["fullscreen", "full screen", "open fullscreen", "open full screen", "toggle fullscreen",
                            "toggle full screen"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.OPEN_FULLSCREEN, urls=urls)

    def test_close_fullscreen(self):
        template_phrases = ["close", "exit", "escape", "quit", "quit fullscreen", "close fullscreen",
                            "close full screen", "exit fullscreen", "exit full screen", "toggle fullscreen off",
                            "toggle full screen off"]
        urls = [youtube] * len(template_phrases)
        self.validate_phrases(template_phrases, h.CLOSE_FULLSCREEN, urls=urls)

    # start music context
    def test_play_music(self):
        template_phrases = ["play", "start", "play music", "play my music", "play song", "play tune", "start music",
                            "start song", "start tune"]
        args = [['true'], ['false'], ['true'], ['false'], ['false'], ['false'], ['true'], ['false'], ['true']]
        urls = [spot, pandora, spot, pandora, pandora, pandora, spot, pandora, spot]
        self.validate_phrases(template_phrases, h.PLAY_MUSIC, args, urls=urls)

    def test_pause_music(self):
        template_phrases = ["pause", "pause music", "paws music", "paws", "paws song", "stop", "stop music",
                            "stop my music", "stop song", "stop tune"]
        args = [['true'], ['false'], ['true'], ['false'], ['false'], ['false'], ['false'], ['true'], ['false'], ['true']]
        urls = [spot, pandora, spot, pandora, pandora, pandora, pandora,  spot, pandora, spot]
        self.validate_phrases(template_phrases, h.PAUSE_MUSIC, args, urls=urls)

    def test_next_song(self):
        template_phrases = ["next", "next song", "next tune", "next on playlist", "next in playlist"]
        args = [['true'], ['false'], ['true'], ['false'], ['false'], ['true']]
        urls = [spot, pandora, spot, pandora, pandora, spot]
        self.validate_phrases(template_phrases, h.NEXT_SONG, args, urls=urls)

    def test_search_music(self):
        template_phrases = ["search artist Elvis Presley", "search artist Led Zeppelin", "search for artist red hot chili peppers",
                            "search album the song remains the same", "search for album by the way",
                            "search song star-spangled banner", "search for song Whole Lotta Love",
                            "search for song one"]
        args = [['false', 'elvis presley', 'artist'], ['true', 'led zeppelin', 'artist'], ['false', 'red hot chili peppers', 'artist'],
                ['false', 'the song remains the same', 'album'], ['true', 'by the way', 'album'],
                ['true', 'star-spangled banner', 'song'], ['true', 'whole lotta love', 'song'],
                ['false', 'one', 'song']]
        urls = [pandora, spot, pandora, pandora, spot, spot, spot, pandora]
        self.validate_phrases(template_phrases, h.SEARCH_MUSIC, args, urls=urls)

    # start doc context
    def test_go_to_page_pdf(self):
        template_phrases = ["go to page four hundred five", "go to page sixty seven", "go to two thousand seven hundred fifty three"]
        args = [[405], [67], [2753]]
        urls = [pdf] * 3
        self.validate_phrases(template_phrases, h.GO_TO_PDF_PAGE, args, urls=urls)
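The class above follows the standard unittest layout, so it can be run with the usual runner; a usage sketch in which the module name is an assumption:

# Hypothetical runner for the test class above.
import unittest

if __name__ == '__main__':
    unittest.main()

# or, from the command line (assuming the file is named test_text_processor.py):
#   python -m unittest test_text_processor -v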
Esempio n. 54
0
    CLASS_WEIGHT = args.class_weight
    N_ESTIMATORS = int(args.estimators)
    LOSS_FUN = args.loss
    KERNEL = args.kernel
    
    print('Word2Vec embedding: %s' %(W2VEC_MODEL_FILE))
    print('Embedding Dimension: %d' %(EMBEDDING_DIM))
    
    cf = configparser.ConfigParser()
    cf.read("../file_path.properties")
    path = dict(cf.items("file_path"))
    dir_w2v = path['dir_w2v']
    dir_in = path['dir_in']

    word2vec_model = gensim.models.Word2Vec.load(dir_w2v+W2VEC_MODEL_FILE)
    tp = TextProcessor()
    doc_list, tw_class = load_files(dir_in)
    tweets = tp.text_process(doc_list, text_only=True)
    tweets = select_tweets(tweets)

    X, Y = gen_data(tweets, tw_class)

    model = classification_model(X, Y, MODEL_TYPE)
    joblib.dump(model, dir_in + MODEL_TYPE + '.skl')


    # python BoWV.py --model logistic --seed 42 -f model_word2vec -d 100 --folds 10
    # python BoWV.py --model gradient_boosting --seed 42 -f model_word2vec -d 100 --loss deviance --folds 10
    # python BoWV.py --model random_forest --seed 42 -f model_word2vec -d 100 --estimators 20 --folds 10
    # python BoWV.py --model svm_linear --seed 42 -f model_word2vec -d 100 --loss squared_hinge --folds 10
    # python BoWV.py --model svm --seed 42 -f model_word2vec -d 100 --kernel rbf --folds 10
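The command lines in the comments above imply an argparse front-end; a rough sketch of such a parser, with flag names inferred from the comments and the args.* accesses, and every default and help text being an assumption:

# Hypothetical argparse setup matching the command lines in the comments above.
import argparse

parser = argparse.ArgumentParser(description='BoWV classification')
parser.add_argument('--model', help='logistic, gradient_boosting, random_forest, svm_linear or svm')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('-f', dest='embeddingfile', help='word2vec model file')
parser.add_argument('-d', dest='dimension', type=int, default=100, help='embedding dimension')
parser.add_argument('--loss', default='squared_hinge')
parser.add_argument('--kernel', default='rbf')
parser.add_argument('--estimators', type=int, default=20)
parser.add_argument('--class_weight', default=None)
parser.add_argument('--folds', type=int, default=10)
args = parser.parse_args()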
Esempio n. 55
0
if __name__=='__main__':

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_in = path['dir_in']
    dir_out = path['dir_out']
    dir_ale = path['dir_ale']
    dir_pck = path['dir_pck']

    doc_list, parl_tw_list = load_files(dir_in)
    _, list_aleatory = load_files(dir_ale)

    tp = TextProcessor()
    tweets = tp.text_process(doc_list, text_only=True)

    parl_tw_processed = list()
    for l in parl_tw_list:
        parl_tw_processed.append(tp.text_process(l, text_only=True))

    alea_tw_processed = list()
    for l in list_aleatory:
        alea_tw_processed.append(tp.text_process(l, text_only=True))

    for i, l in enumerate(alea_tw_processed):
        alea_tw_processed[i] = [n for n in l if n]

    with open(dir_out + "bgr_tfidf_like.pck", 'rb') as handle:
        parl_bigrams = pickle.load(handle)