def create_dataset():
    """Print the predicted IMRaD labels for the section headings of uploaded PDFs.

    Every file in ``UPLOAD_FOLDER`` whose name ends in ``.pdf`` is imported
    with ``ImporterTeambeam``, run through ``TextProcessor.proceed_paper`` and
    its non-blank section headings are handed to
    ``ClassifierSimple.predict_chapter``. For each heading that has at least
    one label flagged with ``1`` a line ``"<heading>: <LABEL> <LABEL> "`` is
    printed. Papers without any usable heading are skipped.

    Returns:
        None. The result is only written to stdout.
    """
    classifier = ClassifierSimple()
    # Order matters: it fixes the order of the label names in the printed line.
    reported_types = (
        IMRaDType.ABSTRACT,
        IMRaDType.INTRODUCTION,
        IMRaDType.BACKGROUND,
        IMRaDType.RESULTS,
        IMRaDType.DISCUSSION,
        IMRaDType.ACKNOWLEDGE,
    )

    for filename in os.listdir(UPLOAD_FOLDER):
        if not filename.endswith('.pdf'):
            continue

        importer = ImporterTeambeam()
        paper = importer.import_paper(filename)
        text_processing = TextProcessor()
        text_processing.proceed_paper(paper)

        # Compare by value: the previous identity test (`is ''`) only worked
        # by accident of string interning.
        chapter_names = [
            section.heading for section in paper.sections
            if section.heading and not section.heading.isspace()
        ]
        if not chapter_names:
            continue

        prob = classifier.predict_chapter(chapter_names)

        for index, row in enumerate(prob):
            # Each matching label keeps its trailing space, exactly as before.
            labels = "".join(
                imrad_type.name + " "
                for imrad_type in reported_types
                if row[imrad_type.value] == 1
            )
            if labels:
                print("{0}: {1}".format(chapter_names[index], labels))
Esempio n. 2
0
class Preprocessor(object):
    """Combine text processing, IMRaD detection and reference linking.

    The instance owns an ``IMRaDDetection``, a ``TextProcessor`` and a
    ``DBClient`` and delegates to them.
    """

    def __init__(self):
        """Instantiate the three helper objects with their default arguments."""
        self.imrad_detector = IMRaDDetection()
        self.text_processor = TextProcessor()
        self.client = DBClient()

    def __add_paper_to_reference(self, paper1, paper2):
        """Point references of ``paper1`` at ``paper2`` when they look alike.

        A reference matches when the similarity ratio between its lower-cased
        raw text and the lower-cased raw title of ``paper2`` reaches
        ``REFERENCE_SIMULARITY_THRESHOLD``. Each match sets
        ``ref.paper_id = [paper2.id, "automated"]`` and immediately stores
        ``paper1`` through the client. Nothing happens when ``paper2`` has no
        processed title.
        """
        if paper2.title_proceed:
            for reference in paper1.references:
                matcher = SequenceMatcher(
                    None,
                    reference.complete_ref_raw.lower(),
                    paper2.title_raw.lower(),
                )
                if matcher.ratio() < REFERENCE_SIMULARITY_THRESHOLD:
                    continue
                reference.paper_id = [paper2.id, "automated"]
                self.client.update_paper(paper1)

    def proceed_paper(self, paper):
        """Run the text processor and then the IMRaD detector on ``paper``."""
        self.text_processor.proceed_paper(paper)
        self.imrad_detector.proceed(paper)

    def proceed_queries(self, queries):
        """Return a dict with every query string run through the text processor.

        The result always has a ``"whole-document"`` key: when ``queries``
        does not provide one, it is inserted first with an empty string.
        """
        processed = {}
        if "whole-document" not in queries:
            processed["whole-document"] = ""
        processed.update(
            (imrad_type, self.text_processor.proceed_string(query))
            for imrad_type, query in queries.items()
        )
        return processed

    def link_references(self, new_paper):
        """Cross-link ``new_paper`` with every paper returned by the client.

        For each stored paper the matching is run in both directions: first
        the stored paper's references against ``new_paper``, then the reverse.
        """
        for known_paper in self.client.get_all_paper():
            directions = ((known_paper, new_paper), (new_paper, known_paper))
            for citing, cited in directions:
                self.__add_paper_to_reference(citing, cited)
Esempio n. 3
0
    def __init__(self, data):
        """Populate the instance from a mapping of stored fields.

        Args:
            data: Mapping queried with ``in`` and ``.get``. Recognised keys
                are ``_id``, ``filename``, ``title_raw``, ``title_proceed``,
                ``authors``, ``sections``, ``references``, ``cited_by``,
                ``word_hist`` and ``file``. Missing keys fall back to empty
                values; a missing ``title_proceed`` is derived from the raw
                title with ``TextProcessor.proceed_string``.

        When ``file`` is absent the content is read from
        ``UPLOAD_FOLDER + filename``. If that file does not exist a message
        is printed and ``file`` becomes an empty ``bytearray``.
        """
        self.id = data.get('_id', '')
        self.filename = data.get('filename')

        self.title_raw = data.get('title_raw', '')
        # Kept as an explicit branch so the text processor only runs when no
        # processed title was supplied.
        if 'title_proceed' in data:
            self.title_proceed = data.get('title_proceed')
        else:
            self.title_proceed = TextProcessor.proceed_string(self.title_raw)

        self.authors = [Authors(author) for author in data.get('authors', [])]
        self.sections = [
            Section(section) for section in data.get('sections', [])
        ]
        self.references = [
            Reference(reference) for reference in data.get('references', [])
        ]
        self.cited_by = data.get('cited_by', [])

        self.word_hist = WordHist(
            data.get('word_hist')) if "word_hist" in data else WordHist()

        if 'file' in data:
            self.file = data.get('file')
        else:
            try:
                # The context manager closes the handle even if read() fails;
                # the previous open(...).read() left that to the garbage
                # collector.
                with open(UPLOAD_FOLDER + self.filename, "rb") as upload:
                    self.file = upload.read()
            except FileNotFoundError as e:
                print("Cant import file: {}. This should only happen in Tests".
                      format(e))
                self.file = bytearray()
Esempio n. 4
0
 def __init__(self, data):
     """Fill the text entry from ``data``.

     ``text_type`` is looked up by name in ``TextType``. ``text_proceed`` is
     taken from ``data`` when present; otherwise it is computed from
     ``text_raw`` with ``TextProcessor.proceed_string``.
     """
     self.text_type = TextType[data.get('text_type')]
     self.text_raw = data.get('text_raw')
     if 'text_proceed' in data:
         self.text_proceed = data.get('text_proceed')
     else:
         self.text_proceed = TextProcessor.proceed_string(
             data.get('text_raw'))
Esempio n. 5
0
    def __init__(self, data):
        """Fill the reference from ``data``.

        ``complete_ref_proceed`` is taken from ``data`` when present and
        otherwise computed from the raw reference text with
        ``TextProcessor.proceed_string``. ``title`` and ``paper_id`` default
        to ``''``. ``authors`` and ``reference_info`` become lists of
        two-element lists whose first element is a ``ReferenceType`` member
        looked up by name; both default to ``[]``.
        """
        self.complete_ref_raw = data.get('complete_ref_raw')
        if 'complete_ref_proceed' in data:
            self.complete_ref_proceed = data.get('complete_ref_proceed')
        else:
            self.complete_ref_proceed = TextProcessor.proceed_string(
                data.get('complete_ref_raw'))

        self.title = data.get('title', '')
        self.paper_id = data.get('paper_id', '')

        # NOTE(review): author entries are tagged via ReferenceType, same as
        # reference_info entries - confirm that is the intended enum.
        self.authors = [
            [ReferenceType[entry.get('author_type')],
             Author(entry.get('author'))]
            for entry in data.get('authors', [])
        ]
        self.reference_info = [
            [ReferenceType[entry.get('reference_type')],
             entry.get('reference_text')]
            for entry in data.get('reference_info', [])
        ]
Esempio n. 6
0
    def __init__(self, data):
        """Fill the section from ``data``.

        ``heading_proceed`` is taken from ``data`` when present and otherwise
        computed from ``heading_raw`` with ``TextProcessor.proceed_string``.
        ``section_type`` is looked up by name in ``SectionType``.
        ``imrad_types``, ``text`` and ``subsections`` are converted element by
        element (``IMRaDType`` lookup, ``Text``, ``Section``) and default to
        ``[]``. ``word_hist`` wraps the stored value in ``WordHist`` or is an
        empty ``WordHist()``.
        """
        self.heading_raw = data.get('heading_raw')
        if 'heading_proceed' in data:
            self.heading_proceed = data.get('heading_proceed')
        else:
            self.heading_proceed = TextProcessor.proceed_string(
                data.get('heading_raw'))

        self.section_type = SectionType[data.get('section_type')]

        self.imrad_types = [
            IMRaDType[type_name] for type_name in data.get('imrad_types', [])
        ]
        self.text = [Text(entry) for entry in data.get('text', [])]
        self.subsections = [
            Section(child) for child in data.get('subsections', [])
        ]

        if "word_hist" in data:
            self.word_hist = WordHist(data.get('word_hist'))
        else:
            self.word_hist = WordHist()
Esempio n. 7
0
 def set_title(self, title_raw):
     """Store a new raw title together with its processed form.

     An empty string is ignored and leaves the current title untouched.
     """
     if title_raw == '':
         return
     self.title_raw = title_raw
     self.title_proceed = TextProcessor.proceed_string(title_raw)
Esempio n. 8
0
 def __init__(self):
     """Create the helper objects stored on the instance.

     An ``IMRaDDetection``, a ``TextProcessor`` and a ``DBClient`` are each
     instantiated without arguments, in that order.
     """
     self.imrad_detector = IMRaDDetection()
     self.text_processor = TextProcessor()
     # NOTE(review): presumably opens or wraps a database connection - confirm
     # against DBClient before constructing this in tests.
     self.client = DBClient()