Example #1
    def parse_metadata(cls, xml_string):

        import xml.etree.ElementTree as ET

        try:
            metaXml = ET.ElementTree(
                ET.fromstring(xml_string, parser=recovering_parser))
        except ET.ParseError:
            raise ParseError("NO METADATA!")

        assert isinstance(metaXml, ET.ElementTree)

        def findAndCombine(query):
            return ";".join([
                " ".join(x.itertext()) for x in metaXml.findall(query)
            ]).strip()

        metadata = {}
        metadata['type'] = metaXml.getroot().get("article-type")

        metadata['title'] = findAndCombine(".//article-title")
        metadata['journal'] = findAndCombine(".//journal-title")
        metadata['publisher'] = findAndCombine(".//publisher-name")

        metadata['abstract'] = findAndCombine(".//article-meta//abstract")

        auth = []
        for group in metaXml.findall(".//contrib"):
            myname = " ".join(x.strip() for x in group.itertext())
            myname = " ".join(re.split("\s+", myname)).strip()
            auth.append(myname)
        metadata['authors'] = auth

        metadata['month'] = findAndCombine(".//article-meta//month")

        metadata['year'] = findAndCombine(".//article-meta//year")
        if ";" in metadata['year']:
            metadata['year'] = metadata['year'].split(";")[0]
        try:
            metadata['year'] = int(metadata['year'])
        except ValueError:
            raise ParseError("No valid year found")

        metadata['volume'] = findAndCombine(".//article-meta//volume")
        metadata['issue'] = findAndCombine(".//article-meta//issue")

        metadata['fpage'] = findAndCombine(".//article-meta//fpage")
        metadata['lpage'] = findAndCombine(".//article-meta//lpage")

        return metadata
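
The call site for parse_metadata is not shown in these examples. A hypothetical invocation with a tiny JATS-style fragment (the element names come from the queries above; the Document class name and classmethod binding are taken from Examples #7-#9) might look like this:

# Hypothetical usage; assumes parse_metadata is a classmethod of Document
# and that recovering_parser accepts this small, well-formed fragment.
sample = """<article article-type="research-article">
  <front>
    <journal-meta><journal-title>Journal of Examples</journal-title></journal-meta>
    <article-meta>
      <article-title>On Widgets</article-title>
      <contrib-group><contrib>Ada Lovelace</contrib></contrib-group>
      <month>3</month><year>1843</year>
      <volume>1</volume><issue>2</issue><fpage>10</fpage><lpage>20</lpage>
    </article-meta>
  </front>
</article>"""

meta = Document.parse_metadata(sample)
# meta['type'] == 'research-article', meta['year'] == 1843,
# meta['authors'] == ['Ada Lovelace']
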
Example #2
def clean_metadata(doi, metadata_str):

    try:
        metaXml = etree.fromstring(metadata_str, parser=recovering_parser)
    except etree.XMLSyntaxError:
        raise ParseError("NO METADATA!", doi)

    def findAndCombine(query):
        return ";".join(
            [" ".join(x.itertext()) for x in metaXml.findall(query)]).strip()

    metadata = {}
    metadata['type'] = metaXml.get("article-type")

    metadata['doi'] = doi
    # print(doi)

    metadata['title'] = findAndCombine(".//article-title")
    metadata['journal'] = findAndCombine(".//journal-title")
    metadata['publisher'] = findAndCombine(".//publisher-name")

    metadata['abstract'] = findAndCombine(".//article-meta//abstract")

    auth = []
    for group in metaXml.findall(".//contrib"):
        myname = " ".join(x.strip() for x in group.itertext())
        myname = " ".join(re.split("\s+", myname)).strip()
        auth.append(myname)
    metadata['authors'] = auth

    metadata['month'] = findAndCombine(".//article-meta//month")

    metadata['year'] = findAndCombine(".//article-meta//year")
    if ";" in metadata['year']:
        metadata['year'] = metadata['year'].split(";")[0]
    try:
        metadata['year'] = int(metadata['year'])
    except ValueError:
        raise ParseError("No valid year found")

    metadata['volume'] = findAndCombine(".//article-meta//volume")
    metadata['issue'] = findAndCombine(".//article-meta//issue")

    metadata['fpage'] = findAndCombine(".//article-meta//fpage")
    metadata['lpage'] = findAndCombine(".//article-meta//lpage")

    return metadata
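
Both variants rely on recovering_parser and ParseError, which are defined elsewhere in the project. A minimal sketch of what those helpers might look like, assuming the recovering parser is an lxml parser with recover=True (the real definitions are not shown here):

# Plausible stand-ins for the undefined helpers; the project's actual code may differ.
from lxml import etree

# lxml parser that tries to recover from malformed or truncated XML
recovering_parser = etree.XMLParser(recover=True)


class ParseError(Exception):
    """Raised when a document or its metadata cannot be parsed."""
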
Example #3
    def find_bibliography(self):
        if self.type != 'research-article':
            print(
                "Unimplemented: finding bibliography on non-research-article")
            return

        num = 0
        pid = None
        for pi, p in enumerate(self.pages):
            fr = re.findall("(REFERENCES|References|Literature Cited)", str(p))
            if not len(fr):
                continue
            num += len(fr)
            pid = pi

        if num > 1:
            raise ParseError("More than one REFERENCES string!")

        if pid is None:
            raise ParseError("No REFERENCE string found...")

        #print("Found bibliography starting on page ", pid+1)

        bibString = []
        for i in range(pid, len(self.pages)):
            if i == pid:
                bibString.append("".join(
                    re.split("(REFERENCES|References|Literature Cited)",
                             str(self.pages[i]))[2:]))
                continue

            bibString.append(str(self.pages[i]))

        self.bibString = "\n".join(bibString).strip()

        # keep only the text that precedes the heading on the REFERENCES page
        newptext = re.split("(REFERENCES|References|Literature Cited)",
                            str(self.pages[pid]))[0].strip()

        if newptext == "":
            # the page holds nothing but bibliography: drop it and everything after
            del self.pages[pid:]
        else:
            # truncate the page to the pre-bibliography text, drop the rest
            self.pages[pid] = Page.from_text(newptext)
            del self.pages[pid + 1:]
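
The slicing above relies on how re.split behaves with a capturing group: the heading is kept as its own list element. A quick illustration:

import re

# Index 0 is the text before the heading, index 1 the heading itself, and
# everything from index 2 onward is the bibliography text.
parts = re.split("(REFERENCES|References|Literature Cited)",
                 "body text REFERENCES Smith 1990. Jones 1991.")
# parts == ['body text ', 'REFERENCES', ' Smith 1990. Jones 1991.']
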
Example #4
    def from_file(cls, fn, complex_parsing=True):
        metadata = cls.get_metadata(fn)
        page_strings = cls.get_page_strings(fn)

        page_strings = list(map(basic_ocr_cleaning, page_strings))
        page_strings = [x for x in page_strings if x != ""]
        if not len(page_strings):
            raise ParseError("Empty document")

        return Document.from_pages(page_strings,
                                   metadata=metadata,
                                   complex_parsing=complex_parsing)
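
basic_ocr_cleaning is used here (and in Examples #7 and #8) but never defined in these snippets. A minimal stand-in consistent with how it is called, taking a page string and returning a cleaned string, might be:

import re


def basic_ocr_cleaning(text):
    # Collapse whitespace and strip; the project's real cleaner presumably
    # handles more OCR artifacts (hyphenation, stray characters, etc.).
    return re.sub(r"\s+", " ", text).strip()
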
Example #5
    def match(self, text, pos=0):
        """Return the parse tree matching this expression at the given
        position, not necessarily extending all the way to the end of ``text``.

        Raise ``ParseError`` if there is no match there.

        :arg pos: The index at which to start matching

        """
        error = ParseError(text)
        node = self.match_core(text, pos, {}, error)
        if node is None:
            raise error
        return node
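
This docstring matches Expression.match from the parsimonious PEG library; assuming that is the source, a small usage sketch:

from parsimonious.grammar import Grammar

grammar = Grammar(r'number = ~"[0-9]+"')
node = grammar.match("42abc")   # matches only the leading digits
print(node.text)                # "42"
# A position where the expression cannot match raises ParseError instead.
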
Example #6
    def extract_sentences(self):
        from nltk import sent_tokenize, word_tokenize

        sents = [word_tokenize(ss) for ss in sent_tokenize(str(self))]

        if not len(sents):
            raise ParseError("Sentences not found")

        full_sentences = [
            Sentence(words=sent_words) for sent_words in sents[1:-1]
        ]
        start_stub = sents[0]
        end_stub = sents[-1]

        self.start_stub = start_stub
        self.end_stub = end_stub
        self.full_sentences = full_sentences
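
The tokenization here needs NLTK's punkt models (nltk.download("punkt")). A quick illustration of the split into stubs and full sentences:

from nltk import sent_tokenize, word_tokenize

text = "tail of a sentence cut by a page break. A complete sentence. And the start of"
sents = [word_tokenize(s) for s in sent_tokenize(text)]
# sents[0] and sents[-1] are kept as start/end "stubs" (possibly truncated at
# the page boundary); everything in between becomes a full Sentence.
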
Example #7
    def from_xml_strings(cls,
                         meta_xml,
                         content_xml,
                         doi=None,
                         complex_parsing=True):

        metadata = cls.parse_metadata(meta_xml)
        metadata['doi'] = doi
        page_strings = cls.parse_content(content_xml)

        page_strings = list(map(basic_ocr_cleaning, page_strings))
        page_strings = [x for x in page_strings if x != ""]
        if not len(page_strings):
            raise ParseError("Empty document")

        return Document.from_pages(page_strings,
                                   metadata=metadata,
                                   complex_parsing=complex_parsing)
Example #8
    def from_file(cls, fn, complex_parsing=True):

        my_name = ".".join(fn.split(".")[:-1])
        doi = my_name.split("-")[-1].replace("_", "/")

        metadataFn = "%s.xml" % my_name
        metadataFn = join(dirname(metadataFn), "..", "metadata",
                          basename(metadataFn))

        with open(metadataFn) as f:
            metadata = cls.parse_metadata(f.read())

        with open(fn) as f:
            page_strings = cls.parse_content(f.read())

        page_strings = list(map(basic_ocr_cleaning, page_strings))
        page_strings = [x for x in page_strings if x != ""]
        if not len(page_strings):
            raise ParseError("Empty document")

        return Document.from_pages(page_strings,
                                   metadata=metadata,
                                   complex_parsing=complex_parsing)
Example #9
    def from_pages(cls, page_strings, metadata=None, complex_parsing=True):
        # avoid a shared mutable default argument
        if metadata is None:
            metadata = {}
        d = Document(metadata)

        if not len(page_strings):
            raise ParseError("No pages...")

        d.pages = [
            Page.from_lines(re.split(r"[\n\r]+", page))
            for page in page_strings
        ]

        if complex_parsing:
            if d.type == 'research-article':
                try:
                    d.find_bibliography()
                    d.extract_headers_footers()
                    d.parse_bibliography()
                except ParseError:
                    print("Couldn't extract bib and headers from ", d['doi'])
                    raise

        return d
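
A hypothetical call, assuming Document.from_pages is exposed as a classmethod and that the metadata dict carries the fields produced by parse_metadata:

# Hypothetical usage; page strings are split into lines by from_pages itself.
pages = [
    "SOME JOURNAL 12 Introduction text of the article...",
    "SOME JOURNAL 13 More body text...\nREFERENCES\nSmith 1990.",
]
doc = Document.from_pages(
    pages,
    metadata={"type": "research-article", "doi": "10.0000/example",
              "title": "Example", "authors": [], "abstract": ""},
    complex_parsing=False)  # skip bibliography/header extraction
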
Example #10
    def extract_headers_footers(self):
        if self.type != 'research-article':
            print(
                "Unimplemented: extracting headers and footers on non-research-article"
            )
            return

        if not len(self.pages):
            raise ParseError("No pages")

        # extracting title, heading stuff on first page, and abstract
        fp = str(self.pages[0]).upper()
        found = []

        f = fp.find(self.metadata['title'].upper())
        found.append((f, len(self.metadata['title'])))

        for a in self.metadata['authors']:
            f = fp.find(a.upper())
            found.append((f, len(a)))

        # to get the university, which is not included in most metadata
        for u in ALL_UNIVERSITIES:
            found.append((fp.find(u), len(u)))

        # limit to those strings which were actually found
        found = list(filter(lambda x: x[0] >= 0, found))

        if len(found):
            # these strings should be "end to end", i.e. less than 5 characters separating them
            found_ends = [x[0] + x[1] for x in found]
            found_min_beginning = min(x[0] for x in found)
            found = list(
                filter(
                    lambda x: any(abs(x[0] - y) < 5 for y in found_ends)
                    or x[0] == found_min_beginning, found))

            # we then cut after these metadata fields!
            doc_start = max(found, key=lambda x: x[0] + x[1])
            doc_start = doc_start[0] + doc_start[1]

            newfp_text = str(self.pages[0])[doc_start:].strip()
            #newfp = Page.from_text(newfp_text)
            self.pages[0] = newfp_text

        # extracting headers via voting
        num_identical_necessary = len(self.pages) / 4

        flines = [str(x).split()[:20] for x in self.pages]
        grams = Counter(tuple(x[:i]) for i in range(1, 15) for x in flines)

        candidates = set(x for x, c in grams.items()
                         if c > num_identical_necessary)
        maximal_candidates = []
        for c in candidates:
            keep = True
            for c2 in candidates:
                if len(c) >= len(c2):
                    continue
                if c[:min(len(c), len(c2))] == c2[:min(len(c), len(c2))]:
                    keep = False

            if keep:
                maximal_candidates.append(c)

        self.headers = [" ".join(x) for x in maximal_candidates]
        for h in self.headers:

            # don't want to get rid of the title on the first page.
            # that's the job for another algorithm
            # therefore, the [1:]
            for i, p in enumerate(self.pages[1:], start=1):
                try:
                    if str(p).index(h) < 30:
                        self.pages[i] = Page.from_text(
                            str(p).replace(h, "").strip())
                except ValueError:
                    continue
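
The header detection is a vote over token prefixes of each page's opening words. A toy illustration of the counting step:

from collections import Counter

# Two pages sharing a running header; the shared prefix tuple is counted once
# per page and becomes a header candidate when it appears on enough pages.
pages = ["SOME JOURNAL 12 Body text starts here", "SOME JOURNAL 13 More body"]
flines = [p.split()[:20] for p in pages]
grams = Counter(tuple(x[:i]) for i in range(1, 15) for x in flines)
# grams[('SOME', 'JOURNAL')] == 2, grams[('SOME', 'JOURNAL', '12')] == 1
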
Example #11
    def get_metadata(cls, fn):

        my_name = ".".join(fn.split(".")[:-1])
        doi = my_name.split("-")[-1].replace("_", "/")

        metadataFn = "%s.xml" % my_name
        metadataFn = join(dirname(metadataFn), "..", "metadata",
                          basename(metadataFn))

        try:
            metaXml = etree.parse(metadataFn, parser=recovering_parser)
        except OSError:
            raise ParseError("NO METADATA!", fn)

        assert (isinstance(metaXml, ElementTree))

        def findAndCombine(query):
            return ";".join([
                " ".join(x.itertext()) for x in metaXml.findall(query)
            ]).strip()

        metadata = {}
        metadata['type'] = metaXml.getroot().get("article-type")

        metadata['doi'] = doi
        # print(doi)

        metadata['title'] = findAndCombine(".//article-title")
        metadata['journal'] = findAndCombine(".//journal-title")
        metadata['publisher'] = findAndCombine(".//publisher-name")

        metadata['abstract'] = findAndCombine(".//article-meta//abstract")

        auth = []
        for group in metaXml.findall(".//contrib"):
            myname = " ".join(x.strip() for x in group.itertext())
            myname = " ".join(re.split("\s+", myname)).strip()
            auth.append(myname)
        metadata['authors'] = auth

        if len(auth) == 0:
            print("no authors found for", doi)
            print(metadata['title'])

        metadata['month'] = findAndCombine(".//article-meta//month")

        metadata['year'] = findAndCombine(".//article-meta//year")
        if ";" in metadata['year']:
            metadata['year'] = metadata['year'].split(";")[0]
        try:
            metadata['year'] = int(metadata['year'])
        except ValueError:
            raise ParseError("No valid year found")

        metadata['volume'] = findAndCombine(".//article-meta//volume")
        metadata['issue'] = findAndCombine(".//article-meta//issue")

        metadata['fpage'] = findAndCombine(".//article-meta//fpage")
        metadata['lpage'] = findAndCombine(".//article-meta//lpage")

        return metadata
Example #12
    def eliminate_metadata(self):
        from fuzzysearch import find_near_matches

        if self.type != 'research-article':
            print(
                "Unimplemented: eliminating metadata on non-research-article"
            )
            return

        if not len(self.pages):
            raise ParseError("No pages")

        fp = str(self.pages[0])
        fp = fp.upper()

        # extracting title, heading stuff on first page, and abstract
        to_extract_from_first_page = [self['title']]

        if len(self['abstract']):
            to_extract_from_first_page.append(self['abstract'])

        for a in self['authors']:
            # also look for the "Last, First" ordering of each author name
            parts = a.split()
            if not parts:
                continue
            arev = parts[-1] + ", " + " ".join(parts[:-1])

            to_extract_from_first_page += [a, arev]

        found = []

        print(to_extract_from_first_page)

        for word in to_extract_from_first_page:
            for match in find_near_matches(word.upper(),
                                           fp,
                                           max_l_dist=len(word) // 10):
                print(match.start, match.end, match.matched, match.dist)
                found.append((match.start, match.end - match.start))

        # limit to those strings which were actually found
        found = list(filter(lambda x: x[0] >= 0, found))

        if len(found):
            # these strings should be "end to end", i.e. less than 5 characters separating them
            found_ends = [x[0] + x[1] for x in found]
            found_min_beginning = min(x[0] for x in found)
            found = list(
                filter(
                    lambda x: any(abs(x[0] - y) < 5 for y in found_ends)
                    or x[0] == found_min_beginning, found))

            # we then cut after these metadata fields!
            doc_start = max(found, key=lambda x: x[0] + x[1])
            doc_start = doc_start[0] + doc_start[1]

            newfp_text = str(self.pages[0])[doc_start:].strip()
            # newfp = Page.from_text(newfp_text)
            self.pages[0] = newfp_text
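
find_near_matches from the fuzzysearch package tolerates OCR noise up to a Levenshtein distance budget. A small illustration of the call used above:

from fuzzysearch import find_near_matches

page_text = "ON THE STRUCTRE OF SCIENTIFIC PAPERS by A. Author"
for m in find_near_matches("STRUCTURE OF SCIENTIFIC PAPERS", page_text,
                           max_l_dist=3):
    print(m.start, m.end, m.dist)  # position and edit distance of the match
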