def read_in_paper(filename, sentences_as_lists=False, preserve_order=False):
    """
    Reads in a paper and returns it as a dictionary.
    :param filename: the name of the paper file to read (the filename only, not the full path to the paper).
    :param sentences_as_lists: if true, returns the sentences of the paper as lists of words rather than strings.
    :param preserve_order: if true, keeps track of the order in which the sections occurred in the paper.
    :return: a dictionary of the form (section: list of sentences in that section)
    """
    paper_to_open = PAPER_SOURCE + filename
    paper_text = Reader().open_file_single_string(paper_to_open)
    udata = paper_text.decode("utf-8")
    paper = udata.encode("ascii", "ignore")
    return paper_tokenize(paper, sentences_as_lists=sentences_as_lists, preserve_order=preserve_order)
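
# A minimal usage sketch (not part of the original module): the filename below
# is hypothetical and only illustrates the expected call; read_in_paper joins
# it onto PAPER_SOURCE itself.
def example_read_single_paper():
    paper = read_in_paper("example_paper.txt", sentences_as_lists=True)
    for section, sentences in paper.items():
        print(section, len(sentences))
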
def read_in_files():
    """Function which reads in all the scientific paper data, and parses it into a list of lists, where each item in
       the list is a sentence, in the form of a list of words. This is the form needed for the Word2Vec model."""

    num_files = len(
        [name for name in os.listdir(DATA_SOURCE) if name.endswith(".txt")])
    loading_section_size = num_files / 30
    count = 0

    sentences_as_lists = []
    for filename in os.listdir(DATA_SOURCE):
        if filename.endswith(".txt"):

            # Pretty loading bar
            print("Processing Files: [", end="")
            for i in range(31, -1, -1):
                if count > i * loading_section_size:
                    for j in range(0, i):
                        print("-", end="")
                        sys.stdout.flush()
                    for j in range(i, 30):
                        print(" ", end="")
                        sys.stdout.flush()
                    break
            if count == num_files:
                print("] ", count, end="\n")
            else:
                print("] ", count, end="\r")
            sys.stdout.flush()

            # Open the paper
            paper_to_open = DATA_SOURCE + filename
            paper = Reader().open_file_single_string(paper_to_open)
            udata = paper.decode("utf-8")
            paper = udata.encode("ascii", "ignore")

            # Split the data into a list of sentences, where each sentence is a list of words
            sentences = sent_tokenize(paper)

            for sentence in sentences:
                words = word_tokenize(sentence)
                sentences_as_lists.append(words)

            if DEBUG:
                print(sentences_as_lists)
                #wait()

            count += 1

    return count, sentences_as_lists
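
# A minimal sketch of feeding the tokenised sentences into a Word2Vec model.
# This assumes the gensim library (gensim >= 4.0 parameter names); the
# hyperparameters are inferred from the model filename
# "summarisation_100features_5minwords_20context" and may not match the
# original training setup exactly.
def example_train_word2vec():
    from gensim.models import Word2Vec
    count, sentences = read_in_files()
    model = Word2Vec(sentences, vector_size=100, min_count=5, window=20, workers=4)
    model.save(MODEL_SOURCE)
    return model
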
def read_stopwords():
    """
    Reads the list of stop words, such as \"and\" and \"or\", that should not be included in the scoring of each
    sentence.
    """
    common_words = Reader().open_file(STOPWORD_SOURCE) + list(string.punctuation)
    return set(common_words)
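
# A small usage sketch: filtering the stop words out of an already-tokenised
# sentence. The example sentence is made up purely for illustration.
def example_filter_stopwords():
    stopwords = read_stopwords()
    sentence = ["the", "model", "achieves", "a", "high", "score", "."]
    return [word for word in sentence if word not in stopwords]
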
def read_definite_non_summary_section_titles():
    """
    Reads the list of section titles from which summary statements very rarely come.
    :return: the list of such section titles.
    """
    return set(Reader().open_file(
        BASE_DIR + "/Data/Utility_Data/definite_non_summary_titles.txt"))
def read_section_titles():
    """
    Reads the section titles permitted in the paper.
    :return: A set of the permitted titles.
    """
    return set(Reader().open_file(BASE_DIR +
                                  "/Data/Utility_Data/permitted_titles.txt"))
def paper_tokenize(text, sentences_as_lists=False, preserve_order=False):
    """
    Takes a paper with the sections delineated by '@&#' and splits it into a dictionary where the key is the section
    title and the value is the text under that section. This could probably be a bit more efficient but it works well enough.
    :param text: the text of the paper to split
    :param sentences_as_lists: if true, returns each section's text as a list of preprocessed sentences (each a list of words) rather than a single string.
    :param preserve_order: if true, tracks the order in which the paper's sections occurred.
    :returns: a dictionary of the form (section: section_text)
    """
    permitted_titles = set(Reader().open_file(PERMITTED_TITLES_SOURCE))

    # Split the text into sections
    if preserve_order:
        split_text_1 = re.split("@&#", text)
        split_text = zip(split_text_1, range(len(split_text_1)))
    else:
        split_text = re.split("@&#", text)

    # The key value. This value is changed if a permitted section title is encountered in the list.
    state = ""

    # After the for loop, this dictionary will have keys relating to each permitted section, and values corresponding
    # to the text of that section
    sentences_with_states = defaultdict(str)

    section_counts = defaultdict(int)

    if preserve_order:
        for text, pos in split_text:

            # Hack for proper sentence tokenization because NLTK tokeniser doesn't work properly for tokenising papers
            text = text.replace("etal.", "etal")
            text = text.replace("et al.", "etal")
            text = text.replace("Fig.", "Fig")
            text = text.replace("fig.", "fig")
            text = text.replace("Eq.", "Eq")
            text = text.replace("eq.", "eq")
            text = text.replace("pp.", "pp")
            text = text.replace("i.e.", "ie")
            text = text.replace("e.g.", "eg")
            text = text.replace("ref.", "ref")
            text = text.replace("Ref.", "Ref")
            text = text.replace("etc.", "etc")
            text = text.replace("Figs.", "Figs")
            text = text.replace("figs.", "figs")
            text = text.replace("No.", "No")
            text = text.replace("eqs.", "eqs")

            # Checks if text is a section title
            if text.lower() in permitted_titles:
                state = text
                section_counts[state] += 1
            else:
                if sentences_as_lists:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = ([preprocess_sentence(x) for x in sent_tokenize(text)], pos)
                else:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = (text, pos)

    if not preserve_order:
        for text in split_text:

            # Hack for proper sentence tokenization because NLTK tokeniser doesn't work properly for tokenising papers
            text = text.replace("etal.", "etal")
            text = text.replace("et al.", "etal")
            text = text.replace("Fig.", "Fig")
            text = text.replace("fig.", "fig")
            text = text.replace("Eq.", "Eq")
            text = text.replace("eq.", "eq")
            text = text.replace("pp.", "pp")
            text = text.replace("i.e.", "ie")
            text = text.replace("e.g.", "eg")
            text = text.replace("ref.", "ref")
            text = text.replace("Ref.", "Ref")
            text = text.replace("etc.", "etc")
            text = text.replace("Figs.", "Figs")
            text = text.replace("figs.", "figs")
            text = text.replace("No.", "No")
            text = text.replace("eqs.", "eqs")

            # Checks if text is a section title
            if text.lower() in permitted_titles:
                state = text
                section_counts[state] += 1
            else:
                if sentences_as_lists:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = [preprocess_sentence(x) for x in sent_tokenize(text)]
                else:
                    if section_counts[state] > 1:
                        state = state + "_" + str(section_counts[state])
                    sentences_with_states[state] = text

    return sentences_with_states
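
# A minimal sketch of the '@&#' input format that paper_tokenize expects: each
# section title is followed by its text, with '@&#' as the delimiter. The
# snippet below is invented for illustration and assumes that "abstract" and
# "conclusion" appear in the permitted titles file.
def example_tokenize_paper():
    raw_text = "@&#ABSTRACT@&#We propose a method. It works well.@&#CONCLUSION@&#The method works."
    sections = paper_tokenize(raw_text, sentences_as_lists=True)
    # sections["ABSTRACT"] would then be a list of preprocessed sentences
    return sections
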
# Location of the file with stopwords in it
STOPWORD_SOURCE = BASE_DIR + "/Data/Utility_Data/common_words.txt"

# Location of the Word2Vec model
MODEL_SOURCE = BASE_DIR + "/Word2Vec/Word2Vec_Models/summarisation_100features_5minwords_20context"

# Location to write the global filecounts of bag of words for each paper and total word count
GLOBAL_WORDCOUNT_WRITE_LOC = BASE_DIR + "/Data/Utility_Data/Global_Counts/"

# This may not be needed anymore
DD_CLASSIFICATION_MODEL_LOC_VECS = BASE_DIR + "/Trained_Models/DoubleDataset/lr_c2_5_vector.pkl"
DD_CLASSIFICATION_MODEL_LOC_FEATS = BASE_DIR + "/Trained_Models/DoubleDataset/lr_c2_5_feature.pkl"
W2V_CLASSIFICATION_MODEL_LOC = BASE_DIR + "/Trained_Models/Word2Vec_Classifier/lr_highlights_only_c3_66.pkl"

# Reads the stopwords as a set
STOPWORDS = set(Reader().open_file(STOPWORD_SOURCE) + list(string.punctuation))

# Counts how many papers there are in the paper directory
NUMBER_OF_PAPERS = len([name for name in os.listdir(PAPER_SOURCE) if name.endswith(".txt")])

# Integers defining the possible location of sentences in the paper
HIGHLIGHT = 0
ABSTRACT = 1
INTRODUCTION = 2
RESULT_DISCUSSION = 3
METHOD = 4
CONCLUSION = 5
OTHER = 6
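
# A hypothetical helper (not part of the original constants) sketching how a
# section title could be mapped onto the location integers above; the keywords
# checked here are assumptions about how sections tend to be named.
def example_section_to_location(section_title):
    title = section_title.lower()
    if "highlight" in title:
        return HIGHLIGHT
    if "abstract" in title:
        return ABSTRACT
    if "introduction" in title:
        return INTRODUCTION
    if "result" in title or "discussion" in title:
        return RESULT_DISCUSSION
    if "method" in title:
        return METHOD
    if "conclusion" in title:
        return CONCLUSION
    return OTHER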

# An object which can compare two sentences and tell how similar they are / if they are the same
SENTENCE_COMPARATOR_OBJ = SentenceComparator()