@classmethod
def parse_metadata(cls, xml_string):
    try:
        import xml.etree.ElementTree as ET
        metaXml = ET.ElementTree(
            ET.fromstring(xml_string, parser=recovering_parser))
    except ET.ParseError:
        # fromstring signals malformed XML with ET.ParseError, not OSError
        raise ParseError("Could not parse metadata XML")
    assert isinstance(metaXml, ET.ElementTree)

    def findAndCombine(query):
        # Join the text of all matching elements, separating matches with ";"
        return ";".join(
            " ".join(x.itertext()) for x in metaXml.findall(query)).strip()

    metadata = {}
    metadata['type'] = metaXml.getroot().get("article-type")
    metadata['title'] = findAndCombine(".//article-title")
    metadata['journal'] = findAndCombine(".//journal-title")
    metadata['publisher'] = findAndCombine(".//publisher-name")
    metadata['abstract'] = findAndCombine(".//article-meta//abstract")

    auth = []
    for group in metaXml.findall(".//contrib"):
        myname = " ".join(x.strip() for x in group.itertext())
        myname = " ".join(re.split(r"\s+", myname)).strip()
        auth.append(myname)
    metadata['authors'] = auth

    metadata['month'] = findAndCombine(".//article-meta//month")
    metadata['year'] = findAndCombine(".//article-meta//year")
    if ";" in metadata['year']:
        # Keep only the first year when several are listed
        metadata['year'] = metadata['year'].split(";")[0]
    try:
        metadata['year'] = int(metadata['year'])
    except (TypeError, ValueError):
        raise ParseError("No valid year found")

    metadata['volume'] = findAndCombine(".//article-meta//volume")
    metadata['issue'] = findAndCombine(".//article-meta//issue")
    metadata['fpage'] = findAndCombine(".//article-meta//fpage")
    metadata['lpage'] = findAndCombine(".//article-meta//lpage")
    return metadata

def clean_metadata(doi, metadata_str):
    try:
        metaXml = etree.fromstring(metadata_str, parser=recovering_parser)
    except etree.XMLSyntaxError:
        # lxml raises XMLSyntaxError (not OSError) on unparseable input
        raise ParseError("Could not parse metadata XML!", doi)

    def findAndCombine(query):
        # Join the text of all matching elements, separating matches with ";"
        return ";".join(
            " ".join(x.itertext()) for x in metaXml.findall(query)).strip()

    metadata = {}
    metadata['type'] = metaXml.get("article-type")
    metadata['doi'] = doi
    metadata['title'] = findAndCombine(".//article-title")
    metadata['journal'] = findAndCombine(".//journal-title")
    metadata['publisher'] = findAndCombine(".//publisher-name")
    metadata['abstract'] = findAndCombine(".//article-meta//abstract")

    auth = []
    for group in metaXml.findall(".//contrib"):
        myname = " ".join(x.strip() for x in group.itertext())
        myname = " ".join(re.split(r"\s+", myname)).strip()
        auth.append(myname)
    metadata['authors'] = auth

    metadata['month'] = findAndCombine(".//article-meta//month")
    metadata['year'] = findAndCombine(".//article-meta//year")
    if ";" in metadata['year']:
        metadata['year'] = metadata['year'].split(";")[0]
    try:
        metadata['year'] = int(metadata['year'])
    except (TypeError, ValueError):
        raise ParseError("No valid year found")

    metadata['volume'] = findAndCombine(".//article-meta//volume")
    metadata['issue'] = findAndCombine(".//article-meta//issue")
    metadata['fpage'] = findAndCombine(".//article-meta//fpage")
    metadata['lpage'] = findAndCombine(".//article-meta//lpage")
    return metadata

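# A minimal usage sketch for the metadata parser above. The JATS-style
# snippet and the DOI are made-up illustrations, not real inputs.
sample_xml = """
<article article-type="research-article">
  <journal-title>Example Journal</journal-title>
  <article-meta>
    <article-title>An Example Title</article-title>
    <contrib><name>Jane Doe</name></contrib>
    <year>1999</year>
  </article-meta>
</article>
"""
meta = clean_metadata("10.0000/example", sample_xml)
print(meta['title'], meta['year'], meta['authors'])
# -> An Example Title 1999 ['Jane Doe']
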
def find_bibliography(self):
    if self.type != 'research-article':
        print("Unimplemented: finding bibliography on non-research-article")
        return
    num = 0
    pid = None
    for pi, p in enumerate(self.pages):
        fr = re.findall("(REFERENCES|References|Literature Cited)", str(p))
        if not fr:
            continue
        num += len(fr)
        pid = pi
    if num > 1:
        raise ParseError("More than one REFERENCES string!")
    if pid is None:
        raise ParseError("No REFERENCES string found...")

    # Everything from the heading onward, across all remaining pages,
    # is the bibliography
    bibString = []
    for i in range(pid, len(self.pages)):
        if i == pid:
            # re.split with a capturing group keeps the heading at index 1,
            # so the bibliography text starts at index 2
            bibString.append("".join(
                re.split("(REFERENCES|References|Literature Cited)",
                         str(self.pages[i]))[2:]))
            continue
        bibString.append(str(self.pages[i]))
    self.bibString = "\n".join(bibString).strip()

    # Truncate or delete the page the bibliography starts on, then drop
    # every page after it (splitting on the same pattern used to find it)
    newptext = re.split("(REFERENCES|References|Literature Cited)",
                        str(self.pages[pid]))[0].strip()
    if newptext == "":
        # The heading opens the page: delete it and everything after
        del self.pages[pid:]
    else:
        # Keep the text preceding the heading on this page
        self.pages[pid] = Page.from_text(newptext)
        del self.pages[pid + 1:]

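# Why the "[2:]" indexing above works: re.split with a capturing group
# keeps the delimiter, so index 0 is the text before the heading, index 1
# is the heading itself, and index 2 onward is the bibliography. A toy
# illustration with made-up page text:
parts = re.split("(REFERENCES|References|Literature Cited)",
                 "Closing remarks. REFERENCES Smith, J. (1990) ...")
# parts == ['Closing remarks. ', 'REFERENCES', ' Smith, J. (1990) ...']
assert "".join(parts[2:]).strip().startswith("Smith")
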
@classmethod
def from_file(cls, fn, complex_parsing=True):
    metadata = cls.get_metadata(fn)
    page_strings = cls.get_page_strings(fn)
    page_strings = list(map(basic_ocr_cleaning, page_strings))
    page_strings = [x for x in page_strings if x != ""]
    if not len(page_strings):
        raise ParseError("Empty document")
    return Document.from_pages(page_strings,
                               metadata=metadata,
                               complex_parsing=complex_parsing)

def match(self, text, pos=0):
    """Return the parse tree matching this expression at the given
    position, not necessarily extending all the way to the end of
    ``text``.

    Raise ``ParseError`` if there is no match there.

    :arg pos: The index at which to start matching

    """
    error = ParseError(text)
    node = self.match_core(text, pos, {}, error)
    if node is None:
        raise error
    return node

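# A usage sketch, assuming this is the parsimonious-style Expression.match;
# the grammar below is an illustration, not part of this codebase.
from parsimonious.grammar import Grammar
grammar = Grammar(r'digits = ~"[0-9]+"')
node = grammar['digits'].match("123abc")  # matches the leading "123" only
print(node.text)  # -> "123"
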
def extract_sentences(self):
    from nltk import sent_tokenize, word_tokenize
    sents = [word_tokenize(ss) for ss in sent_tokenize(str(self))]
    if not len(sents):
        raise ParseError("Sentences not found")
    # The first and last sentences may be cut off at the page boundary,
    # so keep them aside as stubs rather than as full sentences
    self.start_stub = sents[0]
    self.end_stub = sents[-1]
    self.full_sentences = [
        Sentence(words=sent_words) for sent_words in sents[1:-1]
    ]

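# A self-contained illustration of the tokenization above (requires
# nltk's "punkt" models, installable via nltk.download("punkt")):
from nltk import sent_tokenize, word_tokenize
page_text = "end of a broken sentence. A full sentence here. The next page"
sents = [word_tokenize(s) for s in sent_tokenize(page_text)]
# sents[0] and sents[-1] become start_stub and end_stub; only the middle
# sentences are treated as complete.
print(sents[1])  # -> ['A', 'full', 'sentence', 'here', '.']
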
@classmethod
def from_xml_strings(cls, meta_xml, content_xml, doi=None,
                     complex_parsing=True):
    metadata = cls.parse_metadata(meta_xml)
    metadata['doi'] = doi
    page_strings = cls.parse_content(content_xml)
    page_strings = list(map(basic_ocr_cleaning, page_strings))
    page_strings = [x for x in page_strings if x != ""]
    if not len(page_strings):
        raise ParseError("Empty document")
    return Document.from_pages(page_strings,
                               metadata=metadata,
                               complex_parsing=complex_parsing)

@classmethod
def from_file(cls, fn, complex_parsing=True):
    my_name = ".".join(fn.split(".")[:-1])
    doi = my_name.split("-")[-1].replace("_", "/")
    metadataFn = "%s.xml" % my_name
    # The metadata file lives in a sibling "metadata" directory; use
    # basename so the document's own directory isn't joined in twice
    metadataFn = join(dirname(metadataFn), "..", "metadata",
                      basename(metadataFn))
    with open(metadataFn) as f:
        metadata = cls.parse_metadata(f.read())
    # Record the DOI recovered from the filename, as from_xml_strings does
    metadata['doi'] = doi
    with open(fn) as f:
        page_strings = cls.parse_content(f.read())
    page_strings = list(map(basic_ocr_cleaning, page_strings))
    page_strings = [x for x in page_strings if x != ""]
    if not len(page_strings):
        raise ParseError("Empty document")
    return Document.from_pages(page_strings,
                               metadata=metadata,
                               complex_parsing=complex_parsing)

@classmethod
def from_pages(cls, page_strings, metadata=None, complex_parsing=True):
    # Avoid a mutable default argument for the metadata dict
    d = Document(metadata if metadata is not None else {})
    if not len(page_strings):
        raise ParseError("No pages...")
    d.pages = [
        Page.from_lines(re.split(r"[\n\r]+", page))
        for page in page_strings
    ]
    if complex_parsing and d.type == 'research-article':
        try:
            d.find_bibliography()
            d.extract_headers_footers()
            d.parse_bibliography()
        except ParseError:
            print("Couldn't extract bib and headers from", d['doi'])
            raise
    return d

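# A minimal usage sketch; the page text and metadata are illustrative, and
# complex parsing is disabled so no bibliography is required:
doc = Document.from_pages(
    ["First page text.", "Second page text."],
    metadata={'type': 'research-article', 'doi': '10.0000/example'},
    complex_parsing=False)
print(len(doc.pages))  # -> 2
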
def extract_headers_footers(self):
    if self.type != 'research-article':
        print("Unimplemented: extracting headers and footers "
              "on non-research-article")
        return
    if not len(self.pages):
        raise ParseError("Document has no pages")

    # Extract the title, heading material, and abstract on the first page
    fp = str(self.pages[0]).upper()
    found = []
    f = fp.find(self.metadata['title'].upper())
    found.append((f, len(self.metadata['title'])))
    for a in self.metadata['authors']:
        f = fp.find(a.upper())
        found.append((f, len(a)))
    # To catch the university, which is not included in most metadata...
    for u in ALL_UNIVERSITIES:
        found.append((fp.find(u), len(u)))

    # Limit to those strings which were actually found
    found = [x for x in found if x[0] >= 0]
    if len(found):
        # These strings should be "end to end", i.e. fewer than 5
        # characters separating them
        found_ends = [x[0] + x[1] for x in found]
        found_min_beginning = min(x[0] for x in found)
        found = [
            x for x in found
            if any(abs(x[0] - y) < 5 for y in found_ends)
            or x[0] == found_min_beginning
        ]
        # We then cut after these metadata fields!
        doc_start = max(found, key=lambda x: x[0] + x[1])
        doc_start = doc_start[0] + doc_start[1]
        self.pages[0] = str(self.pages[0])[doc_start:].strip()

    # Extract running headers via voting: word n-grams that open more
    # than a quarter of the pages are treated as headers
    num_identical_necessary = len(self.pages) / 4
    flines = [str(x).split()[:20] for x in self.pages]
    grams = Counter(tuple(x[:i]) for i in range(1, 15) for x in flines)
    candidates = set(
        x for x, c in grams.items() if c > num_identical_necessary)
    # Keep only maximal candidates, i.e. drop any n-gram that is a
    # prefix of a longer candidate
    maximal_candidates = []
    for c in candidates:
        keep = True
        for c2 in candidates:
            if len(c) >= len(c2):
                continue
            if c == c2[:len(c)]:
                keep = False
        if keep:
            maximal_candidates.append(c)
    self.headers = [" ".join(x) for x in maximal_candidates]

    for h in self.headers:
        # We don't want to strip the title from the first page; that is
        # the job of another algorithm, hence the pages[1:] (and the
        # start=1 so we modify the page we are actually inspecting)
        for i, p in enumerate(self.pages[1:], start=1):
            try:
                if str(p).index(h) < 30:
                    self.pages[i] = Page.from_text(
                        str(p).replace(h, "").strip())
            except ValueError:
                continue

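# A toy illustration of the header-voting step above: word n-grams that
# open more than a quarter of the pages are flagged as running headers,
# while page-number words that differ from page to page are not.
from collections import Counter
pages = ["JOURNAL OF EXAMPLES %d body text of page %d" % (n, n)
         for n in range(11, 16)]
flines = [p.split()[:20] for p in pages]
grams = Counter(tuple(x[:i]) for i in range(1, 15) for x in flines)
repeated = {g for g, c in grams.items() if c > len(pages) / 4}
print(('JOURNAL', 'OF', 'EXAMPLES') in repeated)        # -> True
print(('JOURNAL', 'OF', 'EXAMPLES', '11') in repeated)  # -> False
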
@classmethod
def get_metadata(cls, fn):
    my_name = ".".join(fn.split(".")[:-1])
    doi = my_name.split("-")[-1].replace("_", "/")
    metadataFn = "%s.xml" % my_name
    metadataFn = join(dirname(metadataFn), "..", "metadata",
                      basename(metadataFn))
    try:
        metaXml = etree.parse(metadataFn, parser=recovering_parser)
    except OSError:
        raise ParseError("NO METADATA!", fn)
    assert isinstance(metaXml, ElementTree)

    def findAndCombine(query):
        # Join the text of all matching elements, separating matches with ";"
        return ";".join(
            " ".join(x.itertext()) for x in metaXml.findall(query)).strip()

    metadata = {}
    metadata['type'] = metaXml.getroot().get("article-type")
    metadata['doi'] = doi
    metadata['title'] = findAndCombine(".//article-title")
    metadata['journal'] = findAndCombine(".//journal-title")
    metadata['publisher'] = findAndCombine(".//publisher-name")
    metadata['abstract'] = findAndCombine(".//article-meta//abstract")

    auth = []
    for group in metaXml.findall(".//contrib"):
        myname = " ".join(x.strip() for x in group.itertext())
        myname = " ".join(re.split(r"\s+", myname)).strip()
        auth.append(myname)
    metadata['authors'] = auth
    if len(auth) == 0:
        # Debug aid: flag documents whose metadata lists no authors
        print("No authors found for", doi, "-", metadata['title'])

    metadata['month'] = findAndCombine(".//article-meta//month")
    metadata['year'] = findAndCombine(".//article-meta//year")
    if ";" in metadata['year']:
        metadata['year'] = metadata['year'].split(";")[0]
    try:
        metadata['year'] = int(metadata['year'])
    except (TypeError, ValueError):
        raise ParseError("No valid year found")

    metadata['volume'] = findAndCombine(".//article-meta//volume")
    metadata['issue'] = findAndCombine(".//article-meta//issue")
    metadata['fpage'] = findAndCombine(".//article-meta//fpage")
    metadata['lpage'] = findAndCombine(".//article-meta//lpage")
    return metadata

def eliminate_metadata(self):
    from fuzzysearch import find_near_matches
    if self.type != 'research-article':
        print("Unimplemented: eliminating metadata on non-research-article")
        return
    if not len(self.pages):
        raise ParseError("Document has no pages")
    fp = str(self.pages[0]).upper()

    # Extract the title, abstract, and author names (in both "First Last"
    # and "Last, First" order) from the first page
    to_extract_from_first_page = [self['title']]
    if len(self['abstract']):
        to_extract_from_first_page.append(self['abstract'])
    for a in self['authors']:
        arev = a.split()
        arev = arev[-1] + ", " + " ".join(arev[:-1])
        to_extract_from_first_page += [a, arev]

    found = []
    for word in to_extract_from_first_page:
        # Allow roughly 10% of the string's length in edits, to
        # tolerate OCR noise
        for match in find_near_matches(word.upper(), fp,
                                       max_l_dist=len(word) // 10):
            found.append((match.start, match.end - match.start))

    # Limit to those strings which were actually found
    found = [x for x in found if x[0] >= 0]
    if len(found):
        # These strings should be "end to end", i.e. fewer than 5
        # characters separating them
        found_ends = [x[0] + x[1] for x in found]
        found_min_beginning = min(x[0] for x in found)
        found = [
            x for x in found
            if any(abs(x[0] - y) < 5 for y in found_ends)
            or x[0] == found_min_beginning
        ]
        # We then cut after these metadata fields!
        doc_start = max(found, key=lambda x: x[0] + x[1])
        doc_start = doc_start[0] + doc_start[1]
        self.pages[0] = str(self.pages[0])[doc_start:].strip()

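# A self-contained illustration of the fuzzy matching used above.
# fuzzysearch's find_near_matches returns Match objects carrying .start,
# .end, .dist, and .matched; max_l_dist bounds the total edit distance.
from fuzzysearch import find_near_matches
page = "THE STRUC7URE OF SCIENT1FIC REVOLUTIONS, by T. Kuhn"
for m in find_near_matches("THE STRUCTURE OF SCIENTIFIC", page, max_l_dist=2):
    print(m.start, m.end, m.dist, repr(m.matched))
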