import re
import local_util as u
logger = u.get_logger(__name__)  # will call setup_logging() if necessary

import shelve
import argparse
import os
import bs4
import gzip

import article_content


class ArticleReader:
    def __init__(self, dbname, aquaint, aquaint2):
        self.dbname = dbname
        self.AQUAINT_DIR = aquaint
        self.AQUAINT2_DIR = aquaint2
        self.ENG_GW = '/opt/dropbox/17-18/573/ENG-GW'

    def __aquaint_filename__(self, doc_id):
        if doc_id[3:8] == '_ENG_':  # True for AQUAINT-2 or ENG-GW files
            filename = '%s/data/%s/%s.xml' % (
                self.AQUAINT2_DIR, doc_id[0:7].lower(),
                doc_id[0:doc_id.find('.') - 2].lower())
            # If that file isn't there, look in ENG-GW
            if not os.path.exists(filename):
                filename = '%s/data/%s/%s.gz' % (
                    self.ENG_GW, doc_id[0:7].lower(),
                    doc_id[0:doc_id.find('.') - 2].lower())
        else:
            # AQUAINT (v1) doc IDs (e.g. 'NYT19990101.0001') have no '_ENG_'
            # marker; this branch is truncated in the source.  Hedged
            # placeholder so the method stays runnable:
            filename = os.path.join(self.AQUAINT_DIR, doc_id.lower())
        return filename
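

# Hedged usage sketch (paths and doc IDs are illustrative, not from the
# original source):
#
#   reader = ArticleReader('articles.db', '/corpora/AQUAINT', '/corpora/AQUAINT-2')
#   # '_ENG_' in positions 3:8 marks AQUAINT-2 / ENG-GW IDs such as
#   # 'APW_ENG_20041001.0001'; AQUAINT (v1) IDs look like 'NYT19990101.0001'.
#   path = reader.__aquaint_filename__('APW_ENG_20041001.0001')
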
Example #2

# qrmatrix.py.d3b, attempt to recreate qrmatrix.py.d3err
# qrmatrix.py.d3_orig, ROUGE-1, 0.22264, 0.25731, 0.23795
# this is the official D3 version.
# This function takes a list of documents (from the class) and writes a single summary file.
import local_util as u
logger = u.get_logger(__name__)  # https://docs.python.org/3/howto/logging.html

import os
import sum_config
import re  # for removing multiple \s characters and source formatting
import math  # for exp for weighting function
from operator import itemgetter
import sys

import preprocess
from nltk.tokenize import sent_tokenize, word_tokenize  # for tokenizing sentences and words

from scipy import spatial


# tally: number of times the word appears in the current document (term frequency)
# ac: total article count
# dc: number of documents the word appears in (document frequency)
def get_tfidf(tally, ac, dc):
    base = 10
    return tally * ((base / (1 + dc)) + 1 + math.log10(ac / (1 + dc)))
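

# Worked example (illustrative numbers, not from the original source): a word
# appearing 3 times in the current document, with 100 articles total and the
# word present in 4 of them, scores
#   3 * ((10 / (1 + 4)) + 1 + math.log10(100 / (1 + 4)))
#     = 3 * (2 + 1 + log10(20)) ~= 12.90
# The (1 + dc) smoothing keeps the score finite when dc == 0.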


# tf * document frequency
# Multiplied by -1 so the result is positive: dc is always less than ac + 1,
# so log10(dc / (ac + 1)) is negative.
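# The function these comments describe is truncated in the source.  A minimal
# sketch of what they suggest (the name get_tfdf and the exact form are
# assumptions, not recovered code):
def get_tfdf(tally, ac, dc):
    # dc < ac + 1, so log10(dc / (ac + 1)) is negative; negate for a positive
    # score.  Assumes dc >= 1, since log10(0) is undefined.
    return -1 * tally * math.log10(dc / (ac + 1))
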
Example #3

import nltk
from nltk.util import ngrams
import re
import math
import argparse
import os
import fnmatch
import time
from collections import Counter

import article_content
import sum_config
import topic_index_reader

import local_util as u
logger = u.get_logger('sentence_distance.py')  # will call setup_logging() if necessary


# Lowercase and tokenize a sentence, keeping only tokens that contain at
# least one letter (drops punctuation and pure numbers).
def sentence_tokens_with_alpha_only(sentence):
    return [
        t for t in nltk.word_tokenize(sentence.lower())
        if re.search("[a-z]", t)
    ]
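

# Illustrative example (assumes the NLTK 'punkt' tokenizer data is installed):
#   sentence_tokens_with_alpha_only("The cat sat, 42 times!")
#   -> ['the', 'cat', 'sat', 'times']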


# Jaccard similarity: 1 minus the Jaccard distance.  nltk.jaccard_distance
# expects sets, so callers must pass token sets, not lists.
def reverse_jaccard_distance_value(tokens1, tokens2):
    return 1.0 - nltk.jaccard_distance(tokens1, tokens2)
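

# Example (hedged, illustrative inputs): an overlap of 2 tokens out of a
# 4-token union gives a Jaccard distance of 0.5, hence similarity 0.5:
#   reverse_jaccard_distance_value({'the', 'cat', 'sat'},
#                                  {'the', 'cat', 'ran'})  # -> 0.5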


# Materialize NLTK's lazy ngrams generator into a list of n-tuples.
def make_ngrams(tokens, n):
    return list(ngrams(tokens, n))
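

# Example: make_ngrams(['a', 'b', 'c'], 2) -> [('a', 'b'), ('b', 'c')]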