import re
import local_util as u
logger = u.get_logger(__name__)  # will call setup_logging() if necessary
import shelve
import argparse
import os
import bs4
import gzip
import article_content


class ArticleReader:

    def __init__(self, dbname, aquaint, aquaint2):
        self.dbname = dbname
        self.AQUAINT_DIR = aquaint
        self.AQUAINT2_DIR = aquaint2
        self.ENG_GW = '/opt/dropbox/17-18/573/ENG-GW'

    def __aquaint_filename__(self, doc_id):
        if doc_id[3:8] == '_ENG_':  # True for AQUAINT-2 or ENG-GW files
            filename = '%s/data/%s/%s.xml' % (
                self.AQUAINT2_DIR,
                doc_id[0:7].lower(),
                doc_id[0:doc_id.find('.') - 2].lower())
            # If that file isn't there, look in ENG-GW
            if not os.path.exists(filename):
                filename = '%s/data/%s/%s.gz' % (
                    self.ENG_GW,
                    doc_id[0:7].lower(),
                    doc_id[0:doc_id.find('.') - 2].lower())
        else:
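# --- Illustrative sketch, not part of the original file. It shows how the
# slicing in __aquaint_filename__ maps an AQUAINT-2 style document ID to a
# monthly corpus file. The doc_id and directory below are hypothetical
# examples, not values taken from the source.
def _demo_aquaint2_path(aquaint2_dir='/corpora/AQUAINT-2',
                        doc_id='APW_ENG_20041015.0123'):
    # AQUAINT-2 keeps one XML file per source per month, e.g.
    # data/apw_eng/apw_eng_200410.xml
    return '%s/data/%s/%s.xml' % (
        aquaint2_dir,
        doc_id[0:7].lower(),                     # 'apw_eng'
        doc_id[0:doc_id.find('.') - 2].lower())  # 'apw_eng_200410'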
# qrmatrix.py.d3b, attempt to recreate qrmatrix.py.d3err
# qrmatrix.py.d3_orig, ROUGE-1, 0.22264, 0.25731, 0.23795
# This is the official D3 version.
# This function takes a list of documents (from class) and writes a single file to summary.
import local_util as u
logger = u.get_logger(__name__)  # https://docs.python.org/3/howto/logging.html
import os
import sum_config
import re    # for removing multiple \s characters and source formatting
import math  # for exp for weighting function
from operator import itemgetter
import sys
import preprocess
from nltk.tokenize import sent_tokenize, word_tokenize  # for tokenizing sentences and words
from scipy import spatial


# tally: times word appears in document
# ac: number of articles (total article count)
# dc: number of documents the word appears in
def get_tfidf(tally, ac, dc):
    base = 10
    # tf * inverse document frequency. Multiplied by -1 (i.e. the log
    # argument is inverted) so that it's a positive number (dc is always
    # less than ac + 1).
    return tally * ((base / (1 + dc)) + 1 + math.log10(ac / (1 + dc)))
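# --- Illustrative check, not part of the original file: a worked get_tfidf
# call with made-up counts (tally=3 occurrences, ac=10 articles, dc=4
# containing documents).
def _demo_get_tfidf():
    # 3 * ((10 / (1 + 4)) + 1 + log10(10 / (1 + 4)))
    #   = 3 * (2 + 1 + 0.30103)
    #   ~= 9.90
    return get_tfidf(3, 10, 4)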
import nltk
from nltk.util import ngrams
import re
import math
import argparse
import os
import fnmatch
import time
from collections import Counter
import article_content
import sum_config
import topic_index_reader
import local_util as u
logger = u.get_logger('sentence_distance.py')  # will call setup_logging() if necessary


def sentence_tokens_with_alpha_only(sentence):
    # Lowercase, tokenize, and keep only tokens containing at least one letter.
    return [t for t in nltk.word_tokenize(sentence.lower()) if re.search('[a-z]', t)]


def reverse_jaccard_distance_value(tokens1, tokens2):
    # nltk.jaccard_distance expects sets; 1 - distance is the Jaccard similarity.
    return 1.0 - nltk.jaccard_distance(tokens1, tokens2)


def make_ngrams(tokens, n):
    return list(ngrams(tokens, n))
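# --- Illustrative usage, not part of the original file. The two sentences are
# made-up examples; since nltk.jaccard_distance operates on sets, the token
# lists are converted before comparison.
def _demo_sentence_similarity():
    tokens1 = set(sentence_tokens_with_alpha_only('The cat sat on the mat.'))
    tokens2 = set(sentence_tokens_with_alpha_only('A cat sat on a mat.'))
    # Jaccard similarity in [0, 1]; 1.0 means identical token sets.
    similarity = reverse_jaccard_distance_value(tokens1, tokens2)
    # Bigrams over a deterministic ordering of the first sentence's tokens.
    bigrams = make_ngrams(sorted(tokens1), 2)
    return similarity, bigrams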