Python WordHist Exemples, engine.utils.objects.word_hist.WordHist Python Exemples

Exemple #1

0

Afficher le fichier

    def __init__(self, data):
        self.id = data.get('_id') if '_id' in data else ''
        self.filename = data.get('filename')

        self.title_raw = data.get('title_raw') if 'title_raw' in data else ''
        self.title_proceed = data.get(
            'title_proceed'
        ) if 'title_proceed' in data else TextProcessor.proceed_string(
            self.title_raw)

        self.authors = [Authors(author) for author in data.get('authors')
                        ] if 'authors' in data else []
        self.sections = [Section(section) for section in data.get('sections')
                         ] if 'sections' in data else []
        self.references = [
            Reference(reference) for reference in data.get('references')
        ] if 'references' in data else []
        self.cited_by = data.get('cited_by') if 'cited_by' in data else []

        self.word_hist = WordHist(
            data.get('word_hist')) if "word_hist" in data else WordHist()

        try:
            self.file = data.get('file') if 'file' in data else open(
                UPLOAD_FOLDER + self.filename, "rb").read()
        except FileNotFoundError as e:
            print("Cant import file: {}. This should only happen in Tests".
                  format(e))
            self.file = bytearray()

Exemple #2

0

Afficher le fichier

Fichier : section.py Projet : thomasmauerhofer/search-engine

    def get_combined_word_hist(self):
        if not self.word_hist:
            for word in self.heading_proceed.split():
                self.word_hist[word] = self.word_hist[
                    word] + 1 if word in self.word_hist else 1

            for text in self.text:
                for word in text.text_proceed.split():
                    self.word_hist[word] = self.word_hist[
                        word] + 1 if word in self.word_hist else 1

        ret = WordHist(self.word_hist.copy())
        for subsection in self.subsections:
            ret.append(subsection.get_combined_word_hist())
        return ret

Exemple #3

0

Afficher le fichier

Fichier : section.py Projet : thomasmauerhofer/search-engine

    def __init__(self, data):
        self.heading_raw = data.get('heading_raw')
        self.heading_proceed = data.get('heading_proceed') if 'heading_proceed' in data else \
            TextProcessor.proceed_string(data.get('heading_raw'))

        self.section_type = SectionType[data.get('section_type')]

        self.imrad_types = [
            IMRaDType[imrad_type] for imrad_type in data.get('imrad_types')
        ] if 'imrad_types' in data else []
        self.text = [Text(text)
                     for text in data.get('text')] if 'text' in data else []
        self.subsections = [
            Section(subsection) for subsection in data.get('subsections')
        ] if 'subsections' in data else []

        self.word_hist = WordHist(
            data.get('word_hist')) if "word_hist" in data else WordHist()

Exemple #4

0

Afficher le fichier

    def get_combined_word_hist(self):
        if not self.word_hist:
            for word in self.title_proceed.split():
                self.word_hist[word] = self.word_hist[
                    word] + 1 if word in self.word_hist else 1

            for section in self.sections:
                self.word_hist.append(section.get_combined_word_hist())

        return WordHist(self.word_hist.copy())

Exemple #5

0

Afficher le fichier

Fichier : db_client.py Projet : thomasmauerhofer/search-engine

 def __calculate_all_papers_hist(self):
     papers = self.get_all_paper()
     intro, background, methods, result, discussion, overall = WordHist(
     ), WordHist(), WordHist(), WordHist(), WordHist(), WordHist()
     for paper in papers:
         overall.append(paper.get_combined_word_hist())
         intro.append(sections_to_word_hist(paper.get_introduction()))
         background.append(sections_to_word_hist(paper.get_background()))
         methods.append(sections_to_word_hist(paper.get_methods()))
         result.append(sections_to_word_hist(paper.get_results()))
         discussion.append(sections_to_word_hist(paper.get_discussion()))
     return {
         "whole-document": overall,
         IMRaDType.INTRODUCTION.name: intro,
         IMRaDType.BACKGROUND.name: background,
         IMRaDType.METHODS.name: methods,
         IMRaDType.RESULTS.name: result,
         IMRaDType.DISCUSSION.name: discussion
     }

Exemple #6

0

Afficher le fichier

class Paper(PaperStructure):
    def __init__(self, data):
        self.id = data.get('_id') if '_id' in data else ''
        self.filename = data.get('filename')

        self.title_raw = data.get('title_raw') if 'title_raw' in data else ''
        self.title_proceed = data.get(
            'title_proceed'
        ) if 'title_proceed' in data else TextProcessor.proceed_string(
            self.title_raw)

        self.authors = [Authors(author) for author in data.get('authors')
                        ] if 'authors' in data else []
        self.sections = [Section(section) for section in data.get('sections')
                         ] if 'sections' in data else []
        self.references = [
            Reference(reference) for reference in data.get('references')
        ] if 'references' in data else []
        self.cited_by = data.get('cited_by') if 'cited_by' in data else []

        self.word_hist = WordHist(
            data.get('word_hist')) if "word_hist" in data else WordHist()

        try:
            self.file = data.get('file') if 'file' in data else open(
                UPLOAD_FOLDER + self.filename, "rb").read()
        except FileNotFoundError as e:
            print("Cant import file: {}. This should only happen in Tests".
                  format(e))
            self.file = bytearray()

    def __str__(self):
        pp = pprint.PrettyPrinter(indent=4)
        return pp.pformat(self.to_dict())

    def __eq__(self, other):
        return self.file == other.file

    def get_sections_with_imrad_type(self, imrad_type):
        if imrad_type == "whole-document":
            return self.sections
        imrad_type = IMRaDType[imrad_type] if isinstance(imrad_type,
                                                         str) else imrad_type
        return [
            chapter for chapter in self.sections
            if imrad_type in chapter.imrad_types
        ]

    def get_sections_with_an_imrad_type(self):
        return [
            chapter for chapter in self.sections
            if (IMRaDType.INTRODUCTION in chapter.imrad_types or IMRaDType.
                BACKGROUND in chapter.imrad_types or IMRaDType.METHODS in
                chapter.imrad_types or IMRaDType.RESULTS in chapter.imrad_types
                or IMRaDType.DISCUSSION in chapter.imrad_types)
        ]

    def get_sections_without_an_imrad_type(self):
        return [
            chapter for chapter in self.sections
            if (not len(chapter.imrad_types) or IMRaDType.ABSTRACT in chapter.
                imrad_types or IMRaDType.ACKNOWLEDGE in chapter.imrad_types)
        ]

    def to_dict(self):
        data = {
            'filename': self.filename,
            'title_raw': self.title_raw,
            'title_proceed': self.title_proceed,
            'file': self.file,
            'authors': [],
            'sections': [],
            'references': [],
            'cited_by': self.cited_by,
            'word_hist': self.word_hist
        }

        for author in self.authors:
            data['authors'].append(author.to_dict())
        for section in self.sections:
            data['sections'].append(section.to_dict())
        for reference in self.references:
            data['references'].append(reference.to_dict())

        return data

    def get_combined_word_hist(self):
        if not self.word_hist:
            for word in self.title_proceed.split():
                self.word_hist[word] = self.word_hist[
                    word] + 1 if word in self.word_hist else 1

            for section in self.sections:
                self.word_hist.append(section.get_combined_word_hist())

        return WordHist(self.word_hist.copy())

    def set_title(self, title_raw):
        if title_raw != '':
            self.title_raw = title_raw
            self.title_proceed = TextProcessor.proceed_string(title_raw)

    def add_abstract(self, text):
        self.sections.append(
            Section({
                'section_type': SectionType.ABSTRACT.name,
                'heading_raw': 'abstract'
            }))
        self.sections[-1].imrad_types.append(IMRaDType.ABSTRACT)
        self.add_text_to_current_section(TextType.MAIN, text)

    def add_section(self, section_name):
        self.sections.append(
            Section({
                'section_type': SectionType.SECTION.name,
                'heading_raw': section_name
            }))

    def add_subsection(self, section_name):
        if not len(self.sections):
            self.add_abstract('')
        self.sections[-1].add_subsection(SectionType.SUBSECTION, section_name)

    def add_subsubsection(self, section_name):
        if not len(self.sections):
            self.add_abstract('')

        if not len(self.sections[-1].subsections):
            self.add_subsection('')

        self.sections[-1].subsections[-1].add_subsection(
            SectionType.SUBSUBSECTION, section_name)

    def add_text_to_current_section(self, text_type, text):
        if not len(self.sections):
            self.add_section('')
        self.sections[-1].add_text_object(text_type, text)

    def add_reference(self, full_reference):
        self.references.append(Reference({'complete_ref_raw': full_reference}))

    def add_authors_text(self, full_authors):
        self.authors.append(Authors({'all_authors_text': full_authors}))

    def get_introduction(self):
        return self.get_sections_with_imrad_type(IMRaDType.INTRODUCTION)

    def get_background(self):
        return self.get_sections_with_imrad_type(IMRaDType.BACKGROUND)

    def get_methods(self):
        return self.get_sections_with_imrad_type(IMRaDType.METHODS)

    def get_results(self):
        return self.get_sections_with_imrad_type(IMRaDType.RESULTS)

    def get_discussion(self):
        return self.get_sections_with_imrad_type(IMRaDType.DISCUSSION)

    def save_file_to_path(self, path):
        open(path + self.filename, 'wb').write(self.file)
        return path + self.filename

    def title_exist(self):
        return bool(self.title_proceed)

    def section_title_exist(self):
        return any([section.title_exist() for section in self.sections])

    def section_text_exist(self):
        return any([section.text_exist() for section in self.sections])

    def subsection_title_exist(self):
        return any(subsection.title_exist() for section in self.sections
                   for subsection in section.subsections)

    def subsection_text_exist(self):
        return any(subsection.text_exist() for section in self.sections
                   for subsection in section.subsections)

    def subsubsection_title_exist(self):
        return any(subsubsection.title_exist() for section in self.sections
                   for subsection in section.subsections
                   for subsubsection in subsection.subsections)

    def subsubsection_text_exist(self):
        return any(subsubsection.text_exist() for section in self.sections
                   for subsection in section.subsections
                   for subsubsection in subsection.subsections)

Exemple #7

0

Afficher le fichier

Fichier : section.py Projet : thomasmauerhofer/search-engine

class Section(PaperStructure):
    def __init__(self, data):
        self.heading_raw = data.get('heading_raw')
        self.heading_proceed = data.get('heading_proceed') if 'heading_proceed' in data else \
            TextProcessor.proceed_string(data.get('heading_raw'))

        self.section_type = SectionType[data.get('section_type')]

        self.imrad_types = [
            IMRaDType[imrad_type] for imrad_type in data.get('imrad_types')
        ] if 'imrad_types' in data else []
        self.text = [Text(text)
                     for text in data.get('text')] if 'text' in data else []
        self.subsections = [
            Section(subsection) for subsection in data.get('subsections')
        ] if 'subsections' in data else []

        self.word_hist = WordHist(
            data.get('word_hist')) if "word_hist" in data else WordHist()

    def __str__(self):
        pp = pprint.PrettyPrinter(indent=4)
        return pp.pformat(self.to_dict())

    def to_dict(self):
        data = {
            'section_type': self.section_type.name,
            'heading_raw': self.heading_raw,
            'heading_proceed': self.heading_proceed,
            'text': [],
            'subsections': [],
            'imrad_types': [],
            'word_hist': self.word_hist
        }

        for text in self.text:
            data['text'].append(text.to_dict())

        for subsection in self.subsections:
            data['subsections'].append(subsection.to_dict())

        for imrad_type in self.imrad_types:
            data['imrad_types'].append(imrad_type.name)

        return data

    def get_combined_word_hist(self):
        if not self.word_hist:
            for word in self.heading_proceed.split():
                self.word_hist[word] = self.word_hist[
                    word] + 1 if word in self.word_hist else 1

            for text in self.text:
                for word in text.text_proceed.split():
                    self.word_hist[word] = self.word_hist[
                        word] + 1 if word in self.word_hist else 1

        ret = WordHist(self.word_hist.copy())
        for subsection in self.subsections:
            ret.append(subsection.get_combined_word_hist())
        return ret

    def add_text_object(self, text_type, text_raw):
        if len(self.subsections):
            self.subsections[-1].add_text_object(text_type, text_raw)
        else:
            self.text.append(
                Text({
                    "text_type": text_type.name,
                    "text_raw": text_raw
                }))

    def add_subsection(self, section_type, heading):
        self.subsections.append(
            Section({
                'section_type': section_type.name,
                'heading_raw': heading
            }))

    def add_to_imrad(self, imrad_type):
        if not any(imrad_type is x for x in self.imrad_types) and \
                (not (self.heading_raw.isspace() or self.heading_raw is '')):
            self.imrad_types.append(imrad_type)
            for subsection in self.subsections:
                subsection.add_to_imrad(imrad_type)

    def title_exist(self):
        return bool(self.heading_proceed)

    def text_exist(self):
        return any([text for text in self.text if text.text_proceed])

Exemple #8

0

Afficher le fichier

def sections_to_word_hist(sections):
    hist = WordHist()
    for section in sections:
        hist.append(section.get_combined_word_hist())
    return hist