import argparse
import csv
import os

# get_cleaned_text comes from the project's text-cleaning utilities
# (a sketch of a minimal version follows this example).


def main():
    argparser = argparse.ArgumentParser(description=("Clean the Quora dataset "
                                                     "by removing newlines in "
                                                     "the data."))
    argparser.add_argument("dataset_input_path",
                           type=str,
                           help=("The path to the raw Quora "
                                 "dataset to clean."))
    argparser.add_argument("dataset_output_path",
                           type=str,
                           help=("The *folder* to write the "
                                 "cleaned file to. The name will just have "
                                 "_cleaned appended to it, before the "
                                 "extension"))
    config = argparser.parse_args()

    clean_rows = []
    with open(config.dataset_input_path, 'r', newline='') as f:
        reader = csv.DictReader(f,
                                fieldnames=[
                                    'id', 'qid1', 'qid2', 'question1',
                                    'question2', 'is_duplicate'
                                ])
        for i, row in enumerate(reader):
            # Explicit fieldnames were passed to the reader, so the file's
            # header line comes through as the first row; skip it.
            if i > 0:
                question1 = get_cleaned_text(row['question1'])
                question2 = get_cleaned_text(row['question2'])

                clean_rows.append([
                    row['id'], row['qid1'], row['qid2'], question1, question2,
                    row['is_duplicate']
                ])

    input_filename_full = os.path.basename(config.dataset_input_path)
    input_filename, input_ext = os.path.splitext(input_filename_full)
    out_path = os.path.join(config.dataset_output_path,
                            input_filename + "_cleaned" + input_ext)

    with open(out_path, "w", newline='') as f:
        writer = csv.DictWriter(f,
                                fieldnames=[
                                    'id', 'qid1', 'qid2', 'question1',
                                    'question2', 'is_duplicate'
                                ])
        writer.writeheader()

        for row in clean_rows:
            writer.writerow({
                'id': row[0],
                'qid1': row[1],
                'qid2': row[2],
                'question1': row[3],
                'question2': row[4],
                'is_duplicate': row[5]
            })
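
Every example on this page is built around get_cleaned_text. For Example #1 the helper only needs to strip the newlines embedded in the Quora questions; here is a minimal sketch of such a helper (an illustrative assumption, not the project's actual implementation, which later examples call with extra options such as stop-word removal):

import re


def get_cleaned_text(text):
    # Replace newlines/tabs with single spaces and collapse repeated whitespace.
    if text is None:
        return ""
    return re.sub(r"\s+", " ", text).strip()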
Example #2
    def __iter__(self):
        """
        Iterator for the document set stored in the DB.
        """
        for pageid in self.get_page_ids():
            # Each pageid is a row tuple; the id is its first element.
            page_id = pageid[0]
            page_content = self.get_page_by_id(page_id)
            clean_text = utils.get_cleaned_text(page_content).split()
            yield clean_text
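
This iterator (and the get_cleaned_pages variants in Examples #3 and #4 below) yields each page as a list of tokens, which is the shape gensim expects when building a dictionary and a bag-of-words corpus, as Example #6 does. A usage sketch, assuming a hypothetical DocumentSet class that defines the __iter__ above:

from gensim import corpora

# DocumentSet is a hypothetical wrapper exposing the __iter__ shown above;
# streaming the cleaned pages keeps the whole collection out of memory.
doc_set = DocumentSet()
dictionary = corpora.Dictionary(doc_set)
corpus = [dictionary.doc2bow(tokens) for tokens in doc_set]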
Example #3
    def get_cleaned_pages(self):
        pages = list()

        for pageid in self.get_page_ids():
            page_id = pageid[0]
            page_content = self.get_page_by_id(page_id)
            clean_text = utils.get_cleaned_text(page_content).split()
            pages.append(clean_text)

        return pages
Example #4
    def get_cleaned_pages(self):
        pages = list()

        for page_hash in self.get_page_hashes():
            # Each page_hash is a row tuple; the hash itself is its first element.
            page_content = self.get_page_by_hash(page_hash[0])
            clean_text = utils.get_cleaned_text(page_content).split()
            pages.append(clean_text)

        return pages
Example #5
def do_content_prediction(test_data,
                          num_classes,
                          max_feature_length,
                          ckpt_dir=None):

    # Load the test data: keep each raw comment and encode its cleaned text
    # into a fixed-length sequence of ids (only the ids are used below).
    # emotes, FLAGS, num_filters and get_comment_ids come from the enclosing
    # module in the original script.
    comments = []
    comments_ids = []
    with open(test_data, 'r') as f:
        for comment in f:
            comments.append(comment)
            comment = get_cleaned_text(comment,
                                       emotes,
                                       remove_stopwords=True,
                                       streamer=FLAGS.streamer,
                                       remove_emotes_or_words=False,
                                       digit_to_string=False)
            comments_ids.append(get_comment_ids(comment, max_feature_length))

    X = np.asarray(comments_ids, dtype='int32')
    if ckpt_dir is not None:
        ckpt_dir = os.path.join(ckpt_dir, 'content')
    else:
        ckpt_dir = os.path.join('checkpoints', 'content')

    model = build_model(num_filters=num_filters,
                        num_classes=num_classes,
                        sequence_max_length=FLAGS.max_feature_length,
                        embedding_size=FLAGS.embedding_size,
                        learning_rate=FLAGS.learning_rate,
                        load_pretrained_model=True,
                        ckpt_dir=ckpt_dir)

    predictions = model.predict(X, batch_size=FLAGS.batch_size, verbose=0)
    predictions = np.argmax(predictions, axis=1)

    return predictions
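
A hypothetical call, for illustration only (the path, class count and feature length below are made up, and FLAGS plus the other module globals are assumed to be configured):

predictions = do_content_prediction('data/test_comments.txt',
                                    num_classes=3,
                                    max_feature_length=1014)
print(predictions[:10])  # one predicted class index per comment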
Example #6
should_rebuild = False

# ### Dictionary ###
# Built from `content`; should_rebuild controls whether the backup at
# dict_file is rebuilt or reused.
dict_file = utils.get_file_path(cfg.DICT_BACKUP)
dictionary = utils.build_dictionary(content, should_rebuild, cfg.DICT_BACKUP)

# ### Corpus ###
# The bag-of-words corpus is handled the same way via corpus_file.
corpus_file = utils.get_file_path(cfg.CORPUS_BACKUP)
corpus = utils.build_corpus(dictionary, content, should_rebuild,
                            cfg.CORPUS_BACKUP)

# ### LDA Model ###
# Clean and tokenize the query, turn it into a bag-of-words vector, score it
# with the topic model, and keep the word listing of its highest-weighted topic.
bow = dictionary.doc2bow(utils.get_cleaned_text(query.lower()).split())
model = utils.build_model(dictionary, corpus, should_rebuild)
q_vec = model[bow]  # "query vector": (topic id, weight) pairs
topic_details = model.print_topic(max(q_vec, key=lambda item: item[1])[0])

print('Dictionary Size = {}'.format(len(dictionary)))
print('Corpus Size = {}'.format(len(corpus)))
print('Topic Details: ')
print(topic_details)
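
Here model behaves like a gensim LDA model: indexing it with a bag-of-words vector returns the query's topic distribution, and print_topic formats the top words of a topic. A minimal sketch of what utils.build_model might return, assuming gensim's LdaModel and omitting the caching the real helper presumably keys on should_rebuild:

from gensim.models import LdaModel


def build_model(dictionary, corpus, should_rebuild=False):
    # Caching/reloading keyed on should_rebuild is omitted here; this always
    # trains a fresh 10-topic LDA model over the bag-of-words corpus.
    return LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)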