Beispiel #1
0
def process_one_file(bibcode, fname, provider):
    """Extract and clean the text of the last matching section of one file.

    Builds the metadata dict passed to ``StandardExtractorXML``, opens and
    parses the file, then selects nodes with ``sections_xpath`` (falling back
    to ``paragraphs_xpath`` when that yields nothing) and runs ``TextCleaner``
    over the text content of the last node found.

    Args:
        bibcode: Value stored under ``CONSTANTS["BIBCODE"]``.
        fname: Path of the file to process; the part after its last ``.`` is
            stored as the format.
        provider: Value stored under ``CONSTANTS["PROVIDER"]``.

    Returns:
        The result of ``TextCleaner(text=...).run()`` for the last matching
        node, or an empty unicode string when no node matches or the node
        has no text content.
    """
    # NOTE: when fname contains no '.', the whole name ends up as the format.
    ext = fname.split('.')[-1]
    d = {
        CONSTANTS["BIBCODE"]: bibcode,
        CONSTANTS["PROVIDER"]: provider,
        CONSTANTS["FORMAT"]: ext,
        CONSTANTS["FILE_SOURCE"]: fname
    }
    extractor = StandardFileExtract.StandardExtractorXML(d)
    extractor.open_xml()
    xml = extractor.parse_xml()

    # Prefer section nodes; fall back to paragraphs when there are none.
    sections = xml.xpath(sections_xpath) or xml.xpath(paragraphs_xpath)
    if not sections:
        # Nothing matched either expression: indexing [-1] would raise
        # IndexError, so report "no text" instead.
        return u""

    summary = unicode(sections[-1].text_content())
    if not summary:
        # Keep the return type consistent: never hand back an un-run
        # TextCleaner instance.
        return u""

    return TextCleaner(text=summary).run()