Example No. 1
def get_hocr(lang, title):

    # FIXME: delete all OCR generated with the 'no' language code and redo it with 'nb'.
    if lang == 'nb':
        lang = 'no'

    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = title.replace(' ', '_')

    try:
        if lang == 'bn':
            title = unicode(title, 'utf-8')
            page_nr = re.sub(u'^.*/([০-৯]+)$', '\\1', title)
            book_name = re.sub(u'^(.*?)(/[০-৯]+)?$', '\\1', title)
            book_name = book_name.encode('utf-8')
            result = ord(page_nr[0]) - ord(u'০')
            for ch in page_nr[1:]:
                result *= 10
                result += ord(ch) - ord(u'০')
            page_nr = result
        else:
            page_nr = re.sub('^.*/([0-9]+)$', '\\1', title)
            book_name = re.sub('^(.*?)(/[0-9]+)?$', '\\1', title)
            page_nr = int(page_nr)
    except:
        return ret_val(1, "unable to extract page number from page: " + title)

    path = cache_path(book_name, lang)

    filename = path + 'page_%04d.hocr' % page_nr

    # We support data built with a compression scheme different from the one
    # currently generated by the server.
    text = utils.uncompress_file(filename, ['bzip2', 'gzip', ''])
    if text is None:
        # Not available: queue an hOCR request so the data is built lazily,
        # but filter out unsupported file types here.
        if book_name.endswith('.djvu') or book_name.endswith('.pdf'):
            import hocr_request
            hocr_request.add_hocr_request(lang, book_name, True)
        return ret_val(
            1, "unable to locate file %s for page %s lang %s" %
            (filename, book_name, lang))

    # Work around https://code.google.com/p/tesseract-ocr/issues/detail?id=690&can=1&q=utf-8
    # A simple patch exists (https://code.google.com/p/tesseract-ocr/source/detail?r=736#)
    # but it's easier to do a double conversion to strip invalid UTF-8 than to
    # maintain a patched version of tesseract.
    text = unicode(text, 'utf-8', 'ignore')
    text = text.encode('utf-8', 'ignore')

    return ret_val(0, text)
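This example resolves the cached hOCR through utils.uncompress_file(filename, ['bzip2', 'gzip', '']) and treats a None result as "not built yet". The helper itself is not shown in the snippet; a minimal sketch, assuming it probes one suffix per compression scheme and returns the raw bytes (or None when no candidate file exists), could look like this:

import bz2
import gzip
import os


def uncompress_file(filename, schemes):
    """Return the decompressed contents of `filename`, trying each scheme in
    `schemes`: 'bzip2' looks for filename + '.bz2', 'gzip' for filename + '.gz',
    and '' for the plain, uncompressed file. Returns None if nothing matches.
    (The suffix convention is an assumption for illustration.)"""
    candidates = {
        'bzip2': (filename + '.bz2', lambda p: bz2.BZ2File(p, 'rb')),
        'gzip': (filename + '.gz', lambda p: gzip.open(p, 'rb')),
        '': (filename, lambda p: open(p, 'rb')),
    }
    for scheme in schemes:
        path, opener = candidates[scheme]
        if not os.path.exists(path):
            continue
        fd = opener(path)
        try:
            return fd.read()
        finally:
            fd.close()
    return None

With this shape, the order of the list is the lookup preference: a bzip2 copy wins over a gzip one, and the plain file is the last resort.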
Example No. 2
    def __set_archives_to_analyze(self, mailing_list, archives):
        archives_to_analyze = []

        for archive in archives:
            # Gmane archives are always analyzed; for everything else, check the visited status
            if archive.url.find(GMANE_DOMAIN) == -1:
                # Check if already analyzed
                status = self.db.check_compressed_file(archive.url)

                this_month = find_current_month(archive.url)

                # If the file is for the current month, re-import to update.
                # If already visited, ignore it.
                if status == self.db.VISITED and not this_month:
                    self.__print_output('Already analyzed %s' % archive.url)
                    continue

            # If not, set visited
            # (before uncompressing, otherwise the db will point towards
            # the uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(archive.url, mailing_list.location, today,
                                    self.db.NEW)

            if archive.is_compressed():
                try:
                    # Uncompress and get the raw filepaths
                    filepaths = uncompress_file(archive.filepath,
                                                archive.compressed_type,
                                                mailing_list.mbox_dir)
                    uncompressed_mboxes = [
                        MBoxArchive(fp, archive.url) for fp in filepaths
                    ]
                    archives_to_analyze.extend(uncompressed_mboxes)
                except IOError as e:
                    # It could be a plain file, so let's give it a chance
                    self.__print_output(
                        "   ***WARNING: Uncompressing file %s - %s" %
                        (archive.filepath, str(e)))
                    archives_to_analyze.append(archive)
            else:
                archives_to_analyze.append(archive)
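This example, together with Examples No. 4 and 5 below, calls a three-argument uncompress_file(filepath, compressed_type, output_dir) that returns the list of paths it extracted into the output directory, so a tar archive can yield several mboxes. Its real implementation is not part of these snippets; a minimal sketch, assuming it recognises 'gzip', 'bzip2', 'zip' and 'tar' (the type names and the suffix-stripping rule are illustrative, not the project's actual values), might be:

import bz2
import gzip
import os
import shutil
import tarfile
import zipfile


def uncompress_file(filepath, extension, output_dir):
    """Uncompress `filepath` into `output_dir` and return the list of
    extracted file paths. `extension` names the compression type."""
    if extension == 'zip':
        archive = zipfile.ZipFile(filepath)
        archive.extractall(output_dir)
        names = archive.namelist()
        archive.close()
        return [os.path.join(output_dir, name) for name in names]
    if extension == 'tar':
        archive = tarfile.open(filepath)
        archive.extractall(output_dir)
        names = archive.getnames()
        archive.close()
        return [os.path.join(output_dir, name) for name in names]

    # Single-file compressors: strip the last suffix and stream the payload out
    openers = {'gzip': gzip.open, 'bzip2': bz2.BZ2File}
    target = os.path.join(output_dir,
                          os.path.splitext(os.path.basename(filepath))[0])
    source = openers[extension](filepath, 'rb')
    try:
        destination = open(target, 'wb')
        try:
            shutil.copyfileobj(source, destination)
        finally:
            destination.close()
    finally:
        source.close()
    return [target]

Returning a list even in the single-file case keeps the caller's code uniform, which is why the examples extend or append the result into files_to_analyze without special-casing archives.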
Example No. 3
def _update_log(outputDir, subjectDir, err):

    log_file = get_log_file(outputDir)
    with open(log_file, "w") as output:

        for i in os.listdir(subjectDir):

            if i == DS_STORE: continue  # skip macOS .DS_Store entries (CPC2018)
            if not utils.is_fasta(i): continue

            fullpath = subjectDir + "/" + i

            if not os.path.isfile(fullpath): continue

            fafile = os.path.abspath(fullpath)
            name, extension = os.path.splitext(i)

            # Check for .gz files from NCBI
            # TODO: rename "file" vars
            if extension == ".gz":
                fafile = utils.uncompress_file(fafile, subjectDir + "/" + name)

            fafileheader = ""
            with open(fafile, "r") as fafileopen:
                fafileheader = fafileopen.readline().strip()

            # Read the header and check that this is a FASTA file
            if not fafileheader.startswith(">"):
                raise Exception("ERROR: " + name + " is not a fasta file")
            else:
                target_name = _get_target_name(fafileheader)

                fasta_record = "\t".join([target_name, fafile, name])
                #if VERBOSE: err.write(fasta_record+"\n")

                output.write(fasta_record + "\n")

    return
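Here utils.uncompress_file(fafile, subjectDir + "/" + name) is a third variant: it takes the .gz source and a destination path, and the return value is the path of the decompressed FASTA file that gets opened next. A minimal sketch under that assumption:

import gzip
import shutil


def uncompress_file(gz_path, dest_path):
    """Decompress the gzip file at `gz_path` into `dest_path` and
    return the destination path."""
    source = gzip.open(gz_path, 'rb')
    try:
        destination = open(dest_path, 'wb')
        try:
            shutil.copyfileobj(source, destination)
        finally:
            destination.close()
    finally:
        source.close()
    return dest_path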
Example No. 4
    def __analyze_non_remote(self, dirname):
        """Walk recursively the directory looking for files,
        and uncompress them. Then __analyze_local_directory is called."""

        # Check if the directory to store uncompressed files already exists
        mbox_dir = os.path.join(self.MBOX_DIR, dirname.lstrip('/'))
        if not os.path.exists(mbox_dir):
            os.makedirs(mbox_dir)
        # Compressed files are left in their original location,
        # because they can be uncompressed from that location

        filepaths = []
        for root, dirs, files in os.walk(dirname):
            filepaths += [os.path.join(root, filename) for filename in files]

        # If a file is for the current month (Mailman filename
        # YYYY-MMM.txt.gz), don't mark it as visited, and download it again.
        # Assuming this is run daily, it's better to take yesterday's date,
        # to ensure we get all of last month's email when the month rolls over.
        yesterday = datetime.datetime.today() + datetime.timedelta(days=-1)
        this_month = yesterday.strftime(mailmanfmt)

        files_to_analyze = {}
        url_list = []
        for filepath in filepaths:

            # Check if already analyzed
            status = self.db.check_compressed_file(filepath)

            # If the file is for the current month, reimport
            current_month = filepath.find(this_month) != -1
            if current_month:
                self.__print_output('Found substring %s in URL %s...' %
                                    (this_month, filepath))

            # If already visited, ignore, unless it's for the current month
            if status == self.db.VISITED and not current_month:
                self.__print_output('Already analyzed %s' % filepath)
                continue
            
            # If not, set visited
            # (before uncompressing, otherwise the db will point towards
            # the uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(filepath, dirname, today, self.db.NEW)

            # Check if compressed
            extension = check_compressed_file(filepath)
            if extension:
                # If compressed, uncompress and get the raw filepath
                filepaths = uncompress_file(filepath, extension, mbox_dir)
                # uncompress_file returns a list containing
                # the paths to all the uncompressed files
                # (for instance, a tar file may contain more than one file)
                files_to_analyze.setdefault(filepath, []).extend(filepaths)
            else:
                # File was not uncompressed, so there is only
                # one file to append
                files_to_analyze.setdefault(filepath, []).append(filepath)

            url_list.append(filepath)

        # The archives are usually retrieved in descending
        # chronological order (because the newest archives are always
        # shown at the top of the archive listing)

        # So we analyze the list of files in the reverse of the order
        # in which they were retrieved
        url_list.reverse()

        return self.__analyze_list_of_files(dirname, url_list,
                                            files_to_analyze)
Example No. 5
    def __analyze_remote(self, url):
        """Download the archives from the remote url, stores and parses them."""

        # Check/create the directories where the archives will be stored
        target = re.sub('^(http|ftp)[s]{0,1}://', '', url)
        compressed_dir = os.path.join(self.COMPRESSED_DIR, target)
        mbox_dir = os.path.join(self.MBOX_DIR, target)
        if not os.path.exists(compressed_dir):
            os.makedirs(compressed_dir)
        if not os.path.exists(mbox_dir):
            os.makedirs(mbox_dir)

        # If a file is for the current month (Mailman filename
        # YYYY-MMM.txt.gz), don't mark it as visited, and download it again.
        # Assuming this is run daily, it's better to take yesterday's date,
        # to ensure we get all of last month's email when the month rolls over.
        yesterday = datetime.datetime.today() + datetime.timedelta(days=-1)
        this_month = yesterday.strftime(mailmanfmt)

        # Get all the links listed in the URL
        htmlparser = MyHTMLParser(url, self.web_user, self.web_password)
        links = htmlparser.get_mboxes_links()

        filepaths = []
        for link in links:
            basename = os.path.basename(link)
            destfilename = os.path.join(compressed_dir, basename)

            # If the URL is for the current month, always retrieve.
            # Otherwise, check visited status & local files first
            if link.find(this_month) >= 0:
                self.__print_output('Found substring %s in URL %s...' %
                                    (this_month, link))
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename, self.web_user,
                                     self.web_password)
            elif os.path.exists(destfilename):   # Check if already downloaded
                self.__print_output('Already downloaded %s' % link)
            else:
                self.__print_output('Retrieving %s...' % link)
                retrieve_remote_file(link, destfilename, self.web_user,
                                     self.web_password)

            filepaths.append((link, destfilename))

        files_to_analyze = {}
        url_list = []
        for link, filepath in filepaths:
            # Check if already analyzed
            status = self.db.check_compressed_file(filepath)

            # If the file is for the current month, reimport
            current_month = filepath.find(this_month) != -1
            if current_month:
                self.__print_output('Found substring %s in URL %s...' %
                                    (this_month, filepath))

            # If already visited, ignore, unless it's for the current month
            if status == self.db.VISITED and not current_month:
                self.__print_output('Already analyzed %s' % filepath)
                continue
            
            # If not, set visited
            # (before uncompressing, otherwise the db will point towards
            # the uncompressed temporary file)
            today = datetime.datetime.today().strftime(datetimefmt)
            self.db.set_visited_url(link, url, today, self.db.NEW)

            # Check if compressed
            extension = check_compressed_file(filepath)
            if extension:
                # If compressed, uncompress and get the raw filepath
                filepaths = uncompress_file(filepath, extension, mbox_dir)
                # uncompress_file returns a list containing
                # the paths to all the uncompressed files
                # (for instance, a tar file may contain more than one file)
                files_to_analyze.setdefault(link, []).extend(filepaths)
            else:
                # File was not uncompressed, so there is only
                # one file to append
                files_to_analyze.setdefault(link, []).append(filepath)

            url_list.append(link)

        # The archives are usually retrieved in descending
        # chronological order (because the newest archives are always
        # shown at the top of the archive listing)

        # So we analyze the list of files in the reverse of the order
        # in which they were retrieved
        url_list.reverse()

        return self.__analyze_list_of_files(url, url_list, files_to_analyze)