def create_dataset():
    """Print the predicted IMRaD labels for the headings of every uploaded PDF.

    Each entry of ``UPLOAD_FOLDER`` whose name ends in ``.pdf`` is imported
    with ``ImporterTeambeam``, run through ``TextProcessor.proceed_paper``,
    and its non-blank section headings are handed to
    ``ClassifierSimple.predict_chapter``.  For every heading that has at
    least one inspected label flagged with ``1``, one line of the form
    ``"<heading>: <LABEL> <LABEL> "`` is printed.  Nothing is returned.
    """
    # Label columns inspected in each prediction row, in output order.
    imrad_types = (
        IMRaDType.ABSTRACT,
        IMRaDType.INTRODUCTION,
        IMRaDType.BACKGROUND,
        IMRaDType.RESULTS,
        IMRaDType.DISCUSSION,
        IMRaDType.ACKNOWLEDGE,
    )

    classifier = ClassifierSimple()
    for filename in os.listdir(UPLOAD_FOLDER):
        if not filename.endswith('.pdf'):
            continue

        # NOTE(review): only the bare file name is passed on; presumably the
        # importer resolves it against UPLOAD_FOLDER itself -- confirm.
        importer = ImporterTeambeam()
        paper = importer.import_paper(filename)
        text_processing = TextProcessor()
        text_processing.proceed_paper(paper)

        # Keep headings that are neither empty nor whitespace-only.  The
        # emptiness check is done by value: comparing with ``is ''`` tests
        # object identity, which is implementation-dependent for strings.
        chapter_names = [section.heading for section in paper.sections
                         if section.heading and not section.heading.isspace()]
        if not chapter_names:
            continue

        prob = classifier.predict_chapter(chapter_names)

        # One prediction row per heading, in the same order as chapter_names.
        for heading, row in zip(chapter_names, prob):
            labels = "".join(imrad_type.name + " "
                             for imrad_type in imrad_types
                             if row[imrad_type.value] == 1)
            if labels:
                print("{0}: {1}".format(heading, labels))
# Example #2
# 0
class Preprocessor(object):
    """Run text processing and IMRaD detection on papers and link references.

    The instance owns an ``IMRaDDetection``, a ``TextProcessor`` and a
    ``DBClient``; every method delegates to one or more of them.
    """

    def __init__(self):
        """Create the detector, text processor and database client."""
        self.imrad_detector = IMRaDDetection()
        self.text_processor = TextProcessor()
        self.client = DBClient()

    def __add_paper_to_reference(self, paper1, paper2):
        """Link references of ``paper1`` that look like the title of ``paper2``.

        Each reference's lower-cased ``complete_ref_raw`` is compared with
        the lower-cased ``paper2.title_raw`` via ``SequenceMatcher.ratio()``.
        When the ratio reaches ``REFERENCE_SIMULARITY_THRESHOLD`` the
        reference's ``paper_id`` is set to ``[paper2.id, "automated"]``.
        ``paper1`` is written back through the client once if at least one
        reference was linked.  Nothing happens when ``paper2.title_proceed``
        is falsy.
        """
        if not paper2.title_proceed:
            return

        # SequenceMatcher caches its analysis of the second sequence, so the
        # title is set once and only the first sequence changes per reference.
        matcher = SequenceMatcher(None, "", paper2.title_raw.lower())
        linked = False
        for ref in paper1.references:
            matcher.set_seq1(ref.complete_ref_raw.lower())
            if matcher.ratio() >= REFERENCE_SIMULARITY_THRESHOLD:
                # NOTE(review): "automated" presumably marks the link as
                # machine-detected rather than user-confirmed -- confirm.
                ref.paper_id = [paper2.id, "automated"]
                linked = True

        # A single write covers every reference changed above.
        if linked:
            self.client.update_paper(paper1)

    def proceed_paper(self, paper):
        """Run the text processor, then the IMRaD detector, on ``paper``."""
        self.text_processor.proceed_paper(paper)
        self.imrad_detector.proceed(paper)

    def proceed_queries(self, queries):
        """Return a dict with every query string passed through the text processor.

        ``queries`` maps a key (the loop calls it an IMRaD type) to a raw
        query string.  The result has the same keys with the values returned
        by ``TextProcessor.proceed_string``.  If ``queries`` has no
        ``"whole-document"`` key, the result additionally carries
        ``"whole-document": ""``.
        """
        queries_proceed = {} if "whole-document" in queries else {
            "whole-document": ""
        }
        for imrad_type, query in queries.items():
            queries_proceed[imrad_type] = self.text_processor.proceed_string(
                query)
        return queries_proceed

    def link_references(self, new_paper):
        """Cross-link ``new_paper`` with every paper returned by the client.

        For each stored paper the reference matching is attempted in both
        directions: stored paper citing ``new_paper`` and ``new_paper``
        citing the stored paper.
        """
        # NOTE(review): if get_all_paper() already contains new_paper, it is
        # also compared against itself -- confirm whether that is intended.
        for paper in self.client.get_all_paper():
            self.__add_paper_to_reference(paper, new_paper)
            self.__add_paper_to_reference(new_paper, paper)