예제 #1
0
def createFromScrapedDefinitions():
    """Build the acronym database from the scraped definition CSVs and dump it.

    Every file in ``file_scraped_definitions_list`` is read as a CSV with a
    header row; the ``acronym``, ``acronym_expansion`` and ``article_id``
    columns are used.  Each expansion is stripped, lower-cased and has ``-``
    replaced by a space before it is stored.

    For each acronym the collected ``[expansion, article_id]`` entries are
    sorted by expansion and then grouped: an entry for which
    ``AcronymExpansion.startsSameWay(entry_expansion, group_expansion)`` is
    true has its expansion overwritten with the group's expansion; otherwise
    it starts a new group and the group index is incremented.  The group
    index is appended to every entry as a third element.  The per-acronym
    lists are converted to ``numpy`` arrays and the resulting dict is handed
    to ``dump``.

    Returns:
        None.  The result is persisted through ``dump``.
    """
    common_logger.info("Creating AcronymDB")
    csv.field_size_limit(sys.maxint)

    acronymDB = {}
    loaded_acronyms = 0
    for definition_file in file_scraped_definitions_list:
        # Context manager so the CSV handle is closed even if a row raises.
        with open(definition_file, "rb") as csv_handle:
            # open as csv file with headers
            acronym_csv = csv.DictReader(csv_handle, delimiter=",")

            for row in acronym_csv:
                acronym = toUnicode(row["acronym"])
                acronym_expansion = toUnicode(row["acronym_expansion"])
                article_id = toUnicode(row["article_id"])
                normalized_expansion = (
                    acronym_expansion.strip().lower().replace('-', ' '))
                acronymDB.setdefault(acronym, []).append(
                    [normalized_expansion, article_id])
                loaded_acronyms += 1
                if loaded_acronyms % 10000 == 0:
                    common_logger.debug("loaded %d acronyms", loaded_acronyms)

    common_logger.info("adding def_count values to acronymDB")
    # Only existing keys are reassigned below, so the dict size never changes
    # while it is being iterated.
    for acronym, values_for_this_acronym in acronymDB.items():
        # Sorting by expansion puts similar expansions next to each other so
        # that a single pass can group them.
        values_for_this_acronym = sorted(
            values_for_this_acronym, key=lambda x: x[0])

        def_count = 0
        expansion_of_last_acronym = values_for_this_acronym[0][0]
        for entry in values_for_this_acronym:
            if AcronymExpansion.startsSameWay(entry[0], expansion_of_last_acronym):
                # Same definition: collapse onto the group's expansion.
                entry[0] = expansion_of_last_acronym
            else:
                # A different expansion starts the next definition group.
                def_count += 1
                expansion_of_last_acronym = entry[0]
            entry.append(def_count)
        acronymDB[acronym] = numpy.array(values_for_this_acronym)

    dump(acronymDB)
    common_logger.info("Dumped AcronymDB successfully")
예제 #2
0
    def get_text(self, file_path):
        """Return the text content of the file at ``file_path``.

        A path ending in ``.txt`` is read with ``open`` in the default text
        mode and returned as-is (it is not passed through ``toUnicode``).

        Any other path is treated as a PDF; ``.pdf`` is appended when the
        path does not already end with it.  All pages are run through a
        ``PDFPageInterpreter`` feeding a ``TextConverter`` (utf-8 codec,
        default ``LAParams``) and the accumulated output is returned via
        ``toUnicode``.

        Args:
            file_path: Path to a ``.txt`` file, a ``.pdf`` file, or a PDF
                path given without its ``.pdf`` extension.

        Returns:
            The file's text.

        Raises:
            IOError/OSError: if the file cannot be opened.  Errors raised by
                the PDF classes propagate unchanged.
        """
        if file_path.endswith('.txt'):
            # Context manager so the handle is released immediately instead
            # of whenever the garbage collector gets to it.
            with open(file_path) as txt_file:
                return txt_file.read()

        if not file_path.endswith('.pdf'):
            file_path = file_path + '.pdf'

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                with open(file_path, 'rb') as fp:
                    # Empty page set and maxpages=0 are passed exactly as
                    # before; presumably they mean "no page restriction" -
                    # TODO confirm against the PDFPage.get_pages docs.
                    pages = PDFPage.get_pages(
                        fp, set(), maxpages=0, password="",
                        caching=True, check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                # Closed before the buffer is read, matching the original
                # order, and also on failure so the device is not leaked.
                device.close()
            return toUnicode(retstr.getvalue())
        finally:
            retstr.close()
예제 #3
0
def createFromScrapedArticles():
    """Build the article database from the scraped article CSVs and dump it.

    Every file in ``file_scraped_articles_list`` is read as a CSV with a
    header row.  Each row's ``article_id`` (through ``toUnicode``) becomes a
    key mapping to its ``article_text`` (also through ``toUnicode``); a later
    row with the same id overwrites an earlier one.  The resulting dict is
    written with ``dump`` to ``file_articledb``.

    Returns:
        None.  The result is persisted through ``dump``.
    """
    common_logger.info("Creating ArticleDB")
    csv.field_size_limit(sys.maxint)

    articleDB = {}
    loaded_articles = 0
    for article_file in file_scraped_articles_list:
        # Context manager so the CSV handle is closed even if a row raises.
        with open(article_file, "rb") as csv_handle:
            # open as csv file with headers
            article_csv = csv.DictReader(csv_handle, delimiter=",")

            for row in article_csv:
                article_id = toUnicode(row["article_id"])
                articleDB[article_id] = toUnicode(row["article_text"])
                loaded_articles += 1
                if loaded_articles % 10000 == 0:
                    common_logger.debug("loaded %d articles", loaded_articles)

    dump(articleDB, path=file_articledb)
    common_logger.info("Dumped ArticleDB successfully")