def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1, min_pages=0, country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location
    doc_num = 0

    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for p in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This bit of code needs to line up with the logic in qdb.py
        # to have the same logic as the page_by_count function
        if len(pages[p]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[p] if x.fold == "train"]
            question_text = "\n".join(" ".join(x.raw_words()) for x in train_questions)
        else:
            question_text = ''

        if use_source:
            filename = '%s/%s' % (source_location, p)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        # gzip returns bytes; decode so it can be joined with the other text
                        source_text = f.read().decode('utf-8', errors='ignore')
                except zlib.error:
                    log.info("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = ''

        if use_wiki:
            wikipedia_text = cw[p].content
        else:
            wikipedia_text = ''

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += source_text

        yield p, total_text
        doc_num += 1

        if 0 < limit < doc_num:
            break
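# Illustrative sketch (not part of the original module): one way text_iterator
# might be consumed, dumping one wiki+question document per answer page. The
# output directory, flag values, and min_pages threshold are assumptions.
def example_dump_corpus(output_dir, wiki_location, qb_location):
    for page, text in text_iterator(True, wiki_location,
                                    True, qb_location,
                                    False, '', min_pages=5):
        # Page titles can contain '/', so flatten them before using as filenames
        safe_name = page.replace('/', '_')
        with open('%s/%s.txt' % (output_dir, safe_name), 'w') as f:
            f.write(text)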
def train_classifier(class_type, question_db=None):
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    log.info("Training classifier: {}".format(class_type))
    all_questions = question_db.questions_with_pages()

    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    classifier = pipeline_creators[class_type]().fit(train_x, train_y)
    return classifier
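# Illustrative sketch (not part of the original module): train a classifier and
# pickle it to disk. The 'category' class_type and output path are placeholders.
def example_save_classifier(class_type='category', output_path='/tmp/classifier.pkl'):
    import pickle
    classifier = train_classifier(class_type)
    with open(output_path, 'wb') as f:
        pickle.dump(classifier, f)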
def initialize_cache(path):
    """
    Iterate over all answer pages and access each one in the cache. This
    forces a prefetch of every Wikipedia page.
    """
    db = QuestionDatabase(QB_QUESTION_DB)
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path)
    pool = Pool()
    input_data = [(format_guess(title), cw) for title in pages.keys()]
    pool.starmap(access_page, input_data)
def web_initialize_file_cache(path, remote_delay=1):
    """
    Initialize the cache by requesting each page with the wikipedia package.
    Iterating over all answer pages and accessing each one in the cache forces
    a prefetch of every Wikipedia page.
    """
    db = QuestionDatabase()
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path, remote_delay=remote_delay)
    pool = Pool()
    input_data = [(title, cw) for title in pages.keys()]
    pool.starmap(access_page, input_data)
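# Illustrative sketch (not part of the original module): choose between the
# local-dump prefetch and the slower web-based prefetch. The cache path and
# delay are placeholders.
def example_prefetch_wikipedia(path='data/external/wikipedia', from_web=False):
    if from_web:
        web_initialize_file_cache(path, remote_delay=1)
    else:
        initialize_cache(path)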
def preprocess_titles():
    # stop_words = set(stopwords.words('english'))
    titles_file = open('data/titles-sorted.txt')
    db = QuestionDatabase()
    pages = {format_guess(page) for page in db.questions_with_pages().keys()}
    with open('data/processed-titles-sorted.txt', 'w') as f:
        for line in titles_file:
            page = format_guess(line.strip().lower())
            # if len(page) > 2 and re.match(r"^[a-zA-Z0-9_()']+$", page)\
            #         and page not in stop_words and page[0].isalnum():
            if page in pages:
                f.write(line.strip().lower())
            else:
                f.write('@')
            f.write('\n')
    titles_file.close()
def wikify(output_directory):
    database = QuestionDatabase(QB_QUESTION_DB)
    pages = database.questions_with_pages()

    total = 0
    for p in pages:
        if len(pages[p]) >= conf['wikifier']['min_appearances']:
            log.info('{} {}'.format(p, len(pages[p])))
            for q in pages[p]:
                total += 1
                for sentence, word, text in q.partials():
                    sentence -= 1
                    with open("%s/%i-%i.txt" % (output_directory, q.qnum, sentence),
                              'w') as output:
                        output.write("%s\n" % text[sentence])
    log.info(str(total))
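# Illustrative sketch (not part of the original module): make sure the wikifier
# scratch directory exists before generating its input files. The directory
# name is a placeholder.
def example_prepare_wikifier_input(scratch_dir='/tmp/wikifier_input'):
    import os
    os.makedirs(scratch_dir, exist_ok=True)
    wikify(scratch_dir)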
def __init__(self):
    super(StatsExtractor, self).__init__()
    with open(SENTENCE_STATS, 'rb') as f:
        self.word_count_mean, self.word_count_std = pickle.load(f)

    self.guess_frequencies = {}
    question_db = QuestionDatabase(QB_QUESTION_DB)
    all_questions = question_db.questions_with_pages()
    for page in all_questions:
        self.guess_frequencies[page] = sum(1 for x in all_questions[page]
                                           if x.fold == "train")

    self.frequency_mean = np.mean(list(self.guess_frequencies.values()))
    self.frequency_std = np.std(list(self.guess_frequencies.values()))
    for page in all_questions:
        normalized_frequency = normalize(self.guess_frequencies[page],
                                         self.frequency_mean,
                                         self.frequency_std)
        self.guess_frequencies[page] = normalized_frequency
    self.normed_missing_guess = normalize(0, self.frequency_mean, self.frequency_std)
def create_report(classifier, class_type, question_db=None):
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    all_questions = question_db.questions_with_pages()
    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    dev = compute_features(all_questions, 'dev', class_type)
    dev_x = dev['text']
    dev_y = dev['label']
    train_score = classifier.score(train_x, train_y)
    dev_score = classifier.score(dev_x, dev_y)

    true_labels = dev['label'].values
    predicted_labels = classifier.predict(dev_x)

    cf_norm = '/tmp/norm_confusion.png'
    plot_confusion(
        'Row Normalized Confusion Matrix of {} Classification'.format(class_type),
        true_labels, predicted_labels, normalized=True)
    plt.savefig(cf_norm, format='png', dpi=200)
    plt.clf()
    plt.cla()
    plt.close()

    cf_unnorm = '/tmp/unnorm_confusion.png'
    plot_confusion(
        'Unnormalized Confusion Matrix of {} Classification'.format(class_type),
        true_labels, predicted_labels, normalized=False)
    plt.savefig(cf_unnorm, format='png', dpi=200)

    correct_by_position = '/tmp/correct_by_position.png'
    dev['prediction'] = pd.Series(predicted_labels)
    dev['correct'] = dev['prediction'] == dev['label']
    pd.pivot_table(
        dev, values=['text'], index=['sentence', 'correct'],
        aggfunc=lambda x: len(x)
    ).unstack(fill_value=0).plot.bar(
        title='Number of Questions Correct vs Sentence Number')
    plt.xlabel('Sentence Number')
    plt.ylabel('Number Correct')
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.gca().legend(handles, ['Number Incorrect', 'Number Correct'])
    plt.savefig(correct_by_position, format='png', dpi=200)

    report = ReportGenerator({
        'unnormalized_confusion_plot': cf_unnorm,
        'normalized_confusion_plot': cf_norm,
        'correct_by_position_plot': correct_by_position,
        'train_score': train_score,
        'dev_score': dev_score,
        'class_type': class_type
    }, 'classifier.md')
    output = safe_path(CLASSIFIER_REPORT_PATH.format(class_type))
    report.create(output)
    plt.clf()
    plt.cla()
    plt.close()
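# Illustrative sketch (not part of the original module): end-to-end training and
# reporting that shares a single QuestionDatabase so questions are loaded once.
# The class_type value is a placeholder.
def example_train_and_report(class_type='category'):
    db = QuestionDatabase(QB_QUESTION_DB)
    classifier = train_classifier(class_type, question_db=db)
    create_report(classifier, class_type, question_db=db)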