def read_in_paper(filename, sentences_as_lists=False, preserve_order=False):
    """
    Reads in a paper and returns it as a dictionary.
    :param filename: the name of the paper file to read (the file name only, not its full path).
    :param sentences_as_lists: if True, returns the sentences of the paper as lists of words rather than as strings.
    :param preserve_order: if True, keeps track of the order in which the sections occurred in the paper.
    :return: a dictionary mapping each section to the list of sentences in that section.
    """
    paper_to_open = PAPER_SOURCE + filename
    paper_text = Reader().open_file_single_string(paper_to_open)
    # Decode from UTF-8, then re-encode as ASCII, dropping any characters that cannot be represented
    udata = paper_text.decode("utf-8")
    paper = udata.encode("ascii", "ignore")
    return paper_tokenize(paper, sentences_as_lists=sentences_as_lists, preserve_order=preserve_order)
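
A minimal usage sketch (not part of the original snippet), assuming PAPER_SOURCE points at the directory holding the paper text files and that paper_tokenize returns the section dictionary described in the docstring:

    # "example_paper.txt" is a hypothetical file name; any paper in PAPER_SOURCE would do.
    paper = read_in_paper("example_paper.txt", sentences_as_lists=True)
    for section, sentences in paper.items():
        print(section, len(sentences))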
Example #2
def read_in_files():
    """Function which reads in all the scientific paper data, and parses it into a list of lists, where each item in
       the list is a sentence, in the form of a list of words. This is the form needed for the Word2Vec model."""

    # Count the .txt files, and size each of the 30 progress-bar segments accordingly
    num_files = len(
        [name for name in os.listdir(DATA_SOURCE) if name.endswith(".txt")])
    loading_section_size = num_files / 30
    count = 0

    sentences_as_lists = []
    for filename in os.listdir(DATA_SOURCE):
        if filename.endswith(".txt"):

            # Pretty loading bar: draw up to 30 '-' characters in proportion to the files processed so far
            print("Processing Files: [", end="")
            for i in range(31, -1, -1):
                if count > i * loading_section_size:
                    for j in range(0, i):
                        print("-", end="")
                        sys.stdout.flush()
                    for j in range(i, 30):
                        print(" ", end="")
                        sys.stdout.flush()
                    break
            # '\r' redraws the counter in place; print a newline only once the last file is reached
            if count == num_files - 1:
                print("] ", count, end="\n")
            else:
                print("] ", count, end="\r")
            sys.stdout.flush()

            # Open the paper and strip any non-ASCII characters from its text
            paper_to_open = DATA_SOURCE + filename
            paper = Reader().open_file_single_string(paper_to_open)
            udata = paper.decode("utf-8")
            paper = udata.encode("ascii", "ignore")

            # Split the paper into sentences; each sentence is then split into a list of words below
            sentences = sent_tokenize(paper)

            for sentence in sentences:
                words = word_tokenize(sentence)
                sentences_as_lists.append(words)

            if DEBUG:
                print(sentences_as_lists)
                #wait()

            count += 1

    return count, sentences_as_lists
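
The list returned by read_in_files can be fed straight into a Word2Vec implementation. A minimal sketch, assuming gensim is the library in use (the parameter names below follow gensim 4.x, where older releases use size instead of vector_size, and the hyperparameter values are illustrative only):

    from gensim.models import Word2Vec

    count, sentences_as_lists = read_in_files()
    # Train a Word2Vec model on the tokenised sentences and save it for later use.
    model = Word2Vec(sentences_as_lists, vector_size=100, window=5, min_count=5)
    model.save("papers_word2vec.model")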