Example #1
    def merge():
        fid = open(CookieExtractor.raw_cache_name, 'r')
        cookie_tuple = Extractor.read_line(fid)

        cookie_info_dict = dict()

        while cookie_tuple is not None:
            # Count the cookies across all of this IP's clicks, and how many times each appears
            for ip in cookie_tuple:
                cookie = cookie_tuple[ip][CookieExtractor.cookie_index]
                if ip in cookie_info_dict.keys():
                    cookie_info = cookie_info_dict[ip]
                    cookie_num = cookie_info[CookieExtractor.cookie_num_index]
                    if cookie in cookie_num.keys():
                        cookie_num[cookie] += 1
                    else:
                        cookie_num[cookie] = 1
                else:
                    cookie_info = list()
                    cookie_num = dict()
                    cookie_info.append(cookie_num)
                    cookie_num[cookie] = 1
                    cookie_info_dict[ip] = cookie_info

            # next line
            cookie_tuple = Extractor.read_line(fid)

        fid = open(CookieExtractor.cache_name, 'w')
        fid.writelines(json.dumps(cookie_info_dict))
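For reference, the cache written above maps each IP to a list whose entry at CookieExtractor.cookie_num_index is a per-cookie click counter. A minimal sketch of reading it back (the IP and cookie values shown are illustrative):

import json

with open(CookieExtractor.cache_name) as cache_file:
    cookie_info_dict = json.load(cache_file)
# e.g. {"10.0.0.1": [{"cookie_a": 3, "cookie_b": 1}], "10.0.0.2": [{"cookie_c": 1}]}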
Example #2
    def merge():
        fid = open(TagExtractor.raw_cache_name, 'r')
        tag_tuple = Extractor.read_line(fid)

        tag_info_dict = dict()

        while tag_tuple is not None:
            for ip in tag_tuple:
                tag = tag_tuple[ip][TagExtractor.tag_index]
                if ip in tag_info_dict.keys():
                    tag_info = tag_info_dict[ip]
                    tag_num_info = tag_info[TagExtractor.tag_num_info_index]
                    if tag in tag_num_info.keys():
                        tag_num_info[tag] += 1
                    else:
                        tag_num_info[tag] = 1
                else:
                    tag_info = list()
                    tag_num_info = dict()
                    tag_info.append(tag_num_info)
                    tag_num_info[tag] = 1
                    tag_info_dict[ip] = tag_info

            tag_tuple = Extractor.read_line(fid)

        fid = open(TagExtractor.cache_name, 'w')
        fid.writelines(json.dumps(tag_info_dict))
Example #3
def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param keep_links: Set to True to keep internal and external links
    :param ignore_headers: if set to True, the output list will not contain
    headers, only paragraphs.

    Returns a list of paragraphs (unicode strings).
    """
    keyword_filter = [
        '<span style="background', 'Chào mừng bạn', 'Xin chào bạn',
        'Mời bạn tham khảo', 'Để chứng tỏ được độ nổi bật của đề tài',
        'Bạn cũng có thể đóng góp', 'Khi thảo luận, bạn', 'Còn thắc mắc? Ghé',
        '## ', ' Hoan nghênh tham gia Wikipedia', 'Wiki chính thức',
        'Tính năng:', ' <font color', 'Tiêu chuẩn bài viết'
    ]
    if not keep_links:
        ignoreTag('a')

    extractor = Extractor(0, '', [])

    # returns a list of strings
    paragraphs = extractor.clean_text(markup,
                                      mark_headers=True,
                                      expand_templates=False,
                                      escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        for k in keyword_filter:
            paragraphs = list(filter(lambda s: not s.startswith(k),
                                     paragraphs))

    return paragraphs
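A minimal usage sketch of the function above (the wikitext string is illustrative; clean_markup and the WikiExtractor helpers it calls are assumed to be importable):

markup = "== Heading ==\nThis is '''bold''' text with a [[link]]."
for paragraph in clean_markup(markup, keep_links=False):
    print(paragraph)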
Example #4
    def extract(line):
        model = RawData(line)
        key = model.ip
        fields = list()

        fields.append(model.placement_id)
        Extractor.solid(key, fields, PlacementExtractor.fid)
Example #5
    def merge():
        fid = open(DevExtractor.raw_cache_name, 'r')
        dev_tuple = Extractor.read_line(fid)

        dev_info_dict = dict()

        while dev_tuple is not None:
            for ip in dev_tuple:
                dev = dev_tuple[ip][DevExtractor.dev_id_index]
                if ip in dev_info_dict.keys():
                    dev_info = dev_info_dict[ip]
                    dev_num_info = dev_info[DevExtractor.dev_num_info_index]
                    if dev in dev_num_info.keys():
                        dev_num_info[dev] += 1
                    else:
                        dev_num_info[dev] = 1
                else:
                    dev_info = list()
                    dev_num_info = dict()
                    dev_info.append(dev_num_info)
                    dev_num_info[dev] = 1
                    dev_info_dict[ip] = dev_info

            # next line
            dev_tuple = Extractor.read_line(fid)

        fid = open(DevExtractor.cache_name, 'w')
        fid.writelines(json.dumps(dev_info_dict))
Example #6
    def extract(line):
        model = RawData(line)
        key = model.ip
        fields = list()

        fields.append(model.user_agent)
        Extractor.solid(key, fields, UAExtractor.fid)
Example #7
 def __init__(self, json_path, text_dir, db_url):
     self.json_path = json_path
     self.text_dir = text_dir
     self.db_url = db_url
     if not isdir(self.text_dir):
         mkdir(self.text_dir)
     self.extractor = Extractor(text_dir)
     self.manager = Manager(db_url)
Example #8
    def extract(line):
        model = RawData(line)
        key = model.ip
        fields = list()

        # cookie
        fields.append(model.cookie)
        Extractor.solid(key, fields, CookieExtractor.fid)
Example #9
    def extract(line):
        model = RawData(line)
        key = model.ip

        # SECOND_TAG
        fields = list()
        fields.append(Context.media_info_table[key][2])

        Extractor.solid(key, fields, TagExtractor.fid)
Example #10
 def __init__(self):
     self.__data = Extractor().extract_csv_data()
     self.__loader = Loader()
     # save all extracted DataFrames from csv files to parquet files
     for k, v in self.__data.items():
         self.__loader.save_to_parquet(k, v)
     # reads all saved parquet files
     data_files = self.__loader.read_parquets("weather")
     # combines all DataFrames into one to get the highest temp from all records
     self.__df = pd.concat(data_files, ignore_index=True)
Example #11
    def extract(line):
        model = RawData(line)
        key = model.ip

        # DEVICE ID
        fields = list()
        fields.append(model.mobile_imei + model.mobile_idfa +
                      model.mobile_android_id + model.mobile_mac)

        Extractor.solid(key, fields, DevExtractor.fid)
Example #12
 def __init__(self, text_dir, db_url, book_url, should_download=False):
     """
     ``text_dir`` is the directory where a copy of text should be put.
     ``db_url`` should be the url to a database that already exists.
     ``should_download`` indicates whether ``book_url`` is a URL on the
     internet rather than a local path.
     """
     self.text_dir = text_dir
     self.db_url = db_url
     self.book_url = book_url
     self.should_download = should_download
     self.manager = Manager(db_url)
     self.extractor = Extractor(text_dir)
Example #13
def classify(k, classifier_path, image):
    im = Image.open(image)
    e = Extractor(im)

    start = datetime.datetime.now()
    print '#' * 37 + ' DATA ' + '#' * 37
    for s_im in e:
        if (Extractor.is_whitespace(s_im)):
            print s_im,
        else:
            classify_digit(k, classifier_path, s_im)
    print '#' * 80
    end = datetime.datetime.now()
    print '### TIME: {} sec###'.format((end - start).total_seconds())
Example #14
class Transformer:
    def __init__(self):
        self.__data = Extractor().extract_csv_data()
        self.__loader = Loader()
        # save all extracted DataFrames from csv files to parquet files
        for k, v in self.__data.items():
            self.__loader.save_to_parquet(k, v)
        # reads all saved parquet files
        data_files = self.__loader.read_parquets("weather")
        # combines all DataFrames into one to get the highest temp from all records
        self.__df = pd.concat(data_files, ignore_index=True)

    def find_hottest_day(self):
        """
        Gets a subset of the combined data containing only the columns we need.
        Then finds the row that equals to the maximum ScreenTemperature and returns it.
        :return: a DataFrame row containing the result of the query.
        """

        # creates a subset of the data with only the columns we need
        df_subset = self.__df[[
            'ObservationDate', 'ScreenTemperature', 'Region'
        ]]
        # find the row with max temperature
        return df_subset[df_subset['ScreenTemperature'] ==
                         df_subset['ScreenTemperature'].max()]
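A brief usage sketch of the class above (it assumes the Extractor and Loader helpers it depends on, plus the weather CSV files, are available):

transformer = Transformer()
print(transformer.find_hottest_day())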
Example #15
    def merge():
        fid = open(PlacementExtractor.raw_cache_name, 'r')
        placement_tuple = Extractor.read_line(fid)

        placement_info_dict = dict()

        while placement_tuple is not None:
            for ip in placement_tuple:
                placement = placement_tuple[ip]
                if ip in placement_info_dict:
                    placement_info = placement_info_dict[ip]
                    placement_num = placement_info[
                        PlacementExtractor.placement_num_index]
                    if placement in placement_num:
                        placement_num[placement] += 1
                    else:
                        placement_num[placement] = 1
                else:
                    placement_info = list()
                    placement_num = dict()
                    placement_info.append(placement_num)
                    placement_num[placement] = 1
                    placement_info_dict[ip] = placement_info

            # next line; without this the loop never advances
            placement_tuple = Extractor.read_line(fid)

        fid = open(PlacementExtractor.cache_name, 'w')
        fid.writelines(json.dumps(placement_info_dict))
Example #16
async def extract(request):
    """
    Returns the text contained in the image.
    """

    body = request.files['file'][0].body
    text = Extractor.extract(body)
    return response.text(text)
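The handler above reads like a Sanic view (request.files, response.text); a hypothetical wiring sketch, with the app name and route path made up:

from sanic import Sanic

app = Sanic("extractor_service")  # illustrative app name
app.add_route(extract, "/extract", methods=["POST"])

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)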
Example #17
 def __init__(self, json_path, text_dir, db_url):
     self.json_path = json_path
     self.text_dir = text_dir
     self.db_url = db_url
     if not isdir(self.text_dir):
         mkdir(self.text_dir)
     self.extractor = Extractor(text_dir)
     self.manager = Manager(db_url)
Example #18
class TestExtractor(TestCase):
    def setUp(self):
        self.extractor = Extractor()

    def test_extract_csv_data(self):
        result = self.extractor.extract_csv_data()
        self.assertCountEqual(['weather.20160201.csv', 'weather.20160301.csv'],
                              result.keys())
Example #19
def main(inputdirs, opts=None, **extraOpts):
    information = None
    
    if not opts and extraOpts:
        # extraOpts is only here for convenience if you want to specify options as keyword arguments.
        # It requires that you haven't specified opts.
        opts = extraOpts
    
    if opts and len(inputdirs) > 0:
        extractor = Extractor( inputdirs
            , infoKls    = Everything
            , tempDir    = opts.get("tempDir")
            , stateKls   = State
            , parserKls  = Parser
            , extension  = opts.get("extension", "js")
            )
            
        information = extractor.generate()
        
        information[PROJECT] = opts.get("project", None)
        information[VERSION] = opts.get("version", None)
        
        information[PROJECT_URL]   = opts.get("projectUrl", None)
        information[COPYRIGHT_TAG] = opts.get("copyrightTag", None)
        
        templatedirs = opts.get("templateDirs", [])
        defaultTemplates = os.path.join(here, "templates")
        if defaultTemplates not in templatedirs:
            templatedirs.insert(0, defaultTemplates)
        
        gen = Generator(
              tempdir  = opts.get("tempDir")
            , outDir   = opts.get("outDir")
            , assetDirs    = opts.get("assetDirs", None)
            , showPrivate  = opts.get("showPrivate", True)
            , templateDirs = templatedirs
            )
        
        gen.process(information)

    else:

        optparser.error("Incorrect number of arguments")
    
    return information
Example #20
def main(inputdirs, opts=None, **extraOpts):
    information = None

    if not opts and extraOpts:
        # extraOpts is only here for convenience if you want to specify options as keyword arguments.
        # It requires that you haven't specified opts.
        opts = extraOpts

    if opts and len(inputdirs) > 0:
        extractor = Extractor(inputdirs,
                              infoKls=Everything,
                              tempDir=opts.get("tempDir"),
                              stateKls=State,
                              parserKls=Parser,
                              extension=opts.get("extension", "js"))

        information = extractor.generate()

        information[PROJECT] = opts.get("project", None)
        information[VERSION] = opts.get("version", None)

        information[PROJECT_URL] = opts.get("projectUrl", None)
        information[COPYRIGHT_TAG] = opts.get("copyrightTag", None)

        templatedirs = opts.get("templateDirs", [])
        defaultTemplates = os.path.join(here, "templates")
        if defaultTemplates not in templatedirs:
            templatedirs.insert(0, defaultTemplates)

        gen = Generator(tempdir=opts.get("tempDir"),
                        outDir=opts.get("outDir"),
                        assetDirs=opts.get("assetDirs", None),
                        showPrivate=opts.get("showPrivate", True),
                        templateDirs=templatedirs)

        gen.process(information)

    else:

        optparser.error("Incorrect number of arguments")

    return information
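A hypothetical invocation of the function above using keyword options (the paths and project metadata are illustrative):

information = main(
    ["./src"],
    tempDir="/tmp/docgen",
    outDir="./docs",
    extension="js",
    project="myproject",
    version="1.0",
)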
Example #21
def train(classifier_path, image_path):
    """Train the classifier with an image.

    This method extracts digits from the image, then prompts the user
    to classify each digit. That classification is stored in the following format:

    Digit: [classification]
    File: [image taken from]
    [PIXEL DATA]
    @

    Args:
        classifier_path (String): path to the file holding the classifier data
        image_path (String): path to the image to train on
    """

    # open the classifier file
    with open(classifier_path, "a+") as f:

        # open the image
        im_name = os.path.basename(image_path)
        im = Image.open(image_path)

        # create an Extractor
        e = Extractor(im)

        # iterate over the SimpleImages extracted
        for s_im in e:
            # check that it isn't whitespace
            if (not Extractor.is_whitespace(s_im)):

                # print the image and have the user classify it
                print s_im
                digit = input("Input value ('s' to skip): ")

                # skip this digit
                if (digit == 's'):
                    continue

                # write the data to the file
                f.write(CLASSIFIER_FORMAT.format(digit, im_name, str(s_im)))
Example #22
 def __init__(self, text_dir, db_url, book_url, should_download=False):
     """
     ``text_dir`` is the directory where a copy of text should be put.
     ``db_url`` should be the url to a database that already exists.
     ``should_download`` indicates whether ``book_url`` is a URL on the
     internet rather than a local path.
     """
     self.text_dir = text_dir
     self.db_url = db_url
     self.book_url = book_url
     self.should_download = should_download
     self.manager = Manager(db_url)
     self.extractor = Extractor(text_dir)
Example #23
 def update(self, rsession):
     url = self.config['url']
     page_limit = int(self.config['page_limit'])
     page_count = 1
     posting_urls = []
     e = Extractor(self.config['fields'])
     while None != url:
         # download listing
         r = rsession.get(url)
         for c in r.cookies:
             rsession.cookies.set_cookie(c)
         clean = UnicodeDammit(r.content)
         content = clean.unicode_markup.encode('utf-8')
         print 'DL: {0}: {1} [{2}], {3} bytes'.format(url, r.status_code, clean.original_encoding, len(content))
         # pull information from html
         result = e.extract(content)
         # get url of next page
         url = result['next_page_url']
         if None == url:
             break
         # make sure the url is absolute
         if 'http' not in url:
             url = self.config['base_url'] + url
         # get all new posting urls
         new_urls = 0
         for posting_url in result['posting_url']:
             if 'http' not in posting_url:
                 posting_url = self.config['base_url'] + posting_url;
             # save any previously unseen urls
             if not Url.query.filter(Url.value == posting_url).count() > 0:
                 new_urls += 1
                 posting_urls.append(Url(value=posting_url, status='listed'))
         session.commit()
         page_count += 1
         # break if page limit reached, or page full of known urls
         if 0 == new_urls or page_count > page_limit:
             break
     print '{0} new urls found'.format(new_urls)
Example #24
def extract_process(jobs_queue, output_queue):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
    """
    while True:
        job = jobs_queue.get()  # job is (id, title, page, ordinal)
        if job:
            out = BytesIO()  # memory buffer
            Extractor(*job[:3]).extract(out)  # (id, title, page)
            text = out.getvalue()
            output_queue.put((job[3], text))  # (ordinal, extracted_text)
            out.close()
        else:
            break
Example #25
def extract_process(jobs_queue, output_queue, html_safe):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text
    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
    :param html_safe: whether to convert entities in text to HTML.
    """
    while True:
        job = jobs_queue.get()  # job is (id, revid, urlbase, title, page, ordinal)
        if job:
            out = StringIO()  # memory buffer
            Extractor(*job[:-1]).extract(out, html_safe)  # (id, urlbase, title, page)
            text = out.getvalue()
            output_queue.put((job[-1], text))  # (ordinal, extracted_text)
            out.close()
        else:
            break
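A sketch of how a worker like the one above might be driven with multiprocessing (the process count and queue sizes are illustrative; the producer must put job tuples in the shape Extractor expects):

from multiprocessing import Process, Queue

jobs_queue = Queue(maxsize=100)
output_queue = Queue(maxsize=100)

workers = [Process(target=extract_process, args=(jobs_queue, output_queue, True))
           for _ in range(4)]
for w in workers:
    w.start()

# ... producer puts (id, revid, urlbase, title, page, ordinal) tuples on jobs_queue ...

for _ in workers:
    jobs_queue.put(None)  # a falsy job makes each worker break out of its loop
for w in workers:
    w.join()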
Example #26
def main():
    base = 'https://www.owler.com'
    # path = '/sector/industrial-machinery-equipment-companies'
    # path = '/industry/industrial-goods-services-companies'
    path = '/industry/industrial-goods-services-companies?p=1319'
    url = base + path
    driver = webdriver.Firefox()
    time.sleep(7)
    wait = WebDriverWait(driver, 20)
    driver.get(url)
    time.sleep(10)
    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()  # Mocked
    resultsInfo = Extractor(driver.page_source)
    sdf = resultsInfo.getData()
    # writeData(sdf, 'biggertest')  # Mocked
    writeData(sdf, 'companies')
    n = resultsInfo.nResults()
    print(n, 'this is main N')
    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Crawl will begin' % (i))
    # for v in range(2, (int(n/15)+1)):
    for v in range(1320, (int(n / 15) + 1)):
        randomPause = random.randint(8, 13)
        for i in range(randomPause, 0, -1):
            time.sleep(1)
            # print('%s seconds - Next page will begin' % (i))
        wait = WebDriverWait(driver, 20)
        wait.until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="next-15"]')))
        driver.find_element_by_xpath('//*[@id="next-15"]').click()
        html = driver.page_source
        info = Extractor(html)
        df = info.getData()
        # writeData(df, 'biggertest')  # Mocked
        writeData(df, 'companies')
        print('Page %s of %s' % (v, int(n / 15)))
        if info.title() == 'Pardon Our Interruption':
            print('wait: %s, p: %s of %s' %
                  (randomPause, v, str(int(n / 15) + 1)))
            print(datetime.datetime.now())
            driver.quit()
            raise SystemExit('They\'re onto us! Ghost out!')
    driver.quit()
Example #27
def main():
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()
    # sPage = requests.get(startUrl)
    # sHtml = sPage.text
    # sPage.raise_for_status()
    sess = dryscrape.Session(base_url='https://www.owler.com')
    sess.set_attribute('auto_load_images', False)
    sess.visit('/sector/industrial-machinery-equipment-companies')
    print(sess.status_code(), sess.headers())
    sHtml = sess.body()
    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()  # Mocked
    resultsInfo = Extractor(sHtml)
    sdf = resultsInfo.getData()
    print(type(sdf))
    # writeData(sdf, 'companies')
    writeData(sdf, 'runcompanies')  # Mocked
    n = resultsInfo.nResults()
    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Next page will begin' % (i))
    for v in range(2, int(n / 15)):
        nextone = '/sector/industrial-machinery-equipment-companies?p=%s' % (v)
        print(nextone)
        # page = requests.get(nextpage)
        # page.raise_for_status()
        # html = page.text
        sess.visit(nextone)
        print(sess.status_code(), sess.headers())
        html = sess.body()
        info = Extractor(html)
        # info = Extractor(sHtml)  # Mocked
        df = info.getData()
        # writeData(df, 'companies')
        writeData(df, 'runcompanies')  # Mocked
        for i in range(20, 0, -1):
            time.sleep(1)
            print('%s seconds - Next page will begin' % (i))
Example #28
def main():
    args = parser.parse_args()
    extractor = Extractor()
    extractor.top = args.top
    extractor.bottom = args.bottom
    extractor.left = args.left
    extractor.right = args.right
    file_name = args.video_file
    video = cv2.VideoCapture(file_name)
    fps = video.get(cv2.CAP_PROP_FPS)
    length = video.get(cv2.CAP_PROP_FRAME_COUNT)
    interval_sec = INTERVAL / fps
    success = True
    index = 0
    characters = 0
    last_text = ''
    last_sec = 0
    while success:
        success, frame = video.read()
        total_sec = index / fps
        if index % INTERVAL == 0 and total_sec >= last_sec:
            text = extractor.extract(frame)
            characters += len(text)
            if len(text) == 0:
                dur = total_sec - last_sec
                output_subtitle(last_sec, last_text, dur)
            else:
                if similarity(text, last_text) > 0.6:
                    dur = interval_sec / 2
                else:
                    dur = len(text) * TIME_PER_CHAR
                output_subtitle(last_sec, text, dur)
            msg("%d%% Processed, %d Characters Scanned" % (floor(
                (index / length) * 100), characters))
            last_sec = last_sec + dur
            last_text = text
        index += 1
Example #29
    def merge():
        fid = open(UAExtractor.raw_cache_name, 'r')
        ua_tuple = Extractor.read_line(fid)

        ua_info_dict = dict()

        while ua_tuple is not None:
            for ip in ua_tuple:
                ua = ua_tuple[ip]
                if ip in ua_info_dict:
                    ua_info = ua_info_dict[ip]
                    ua_num = ua_info[UAExtractor.ua_num_index]
                    if ua in ua_num:
                        ua_num[ua] += 1
                    else:
                        ua_num[ua] = 1
                else:
                    ua_info = list()
                    ua_num = dict()
                    ua_info.append(ua_num)
                    ua_num[ua] = 1
                    ua_info_dict[ip] = ua_info

            # next line; without this the loop never advances
            ua_tuple = Extractor.read_line(fid)

        fid = open(UAExtractor.cache_name, 'w')
        fid.writelines(json.dumps(ua_info_dict))
Example #30
 def get_dev_num_info(ip):
     dev_info_dict = Extractor.read_cache(DevExtractor.cache_name)
     dev_num_info = dev_info_dict[ip][DevExtractor.dev_num_info_index]
     return dev_num_info
Example #31
class Query(object):

    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``should_download`` indicates whether ``book_url`` is a URL on the
        internet rather than a local path.
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        self.run()
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        e, r = self._probabilities(wcp)
        self.elizabethan_factor = e
        self.romantic_factor = r

    def results(self):
        """
        Returns a tuple (e, r) with the factors that this book is Elizabethan
        or Romantic, respectively.
        """
        return self.elizabethan_factor, self.romantic_factor


    def clean_up(self):
        if self.should_download:
            os.remove(self.filename)
        
    def _word_rates(self):
        """
        Downloads the book if needed, or makes a copy of it.
        Returns a dictionary of words and their rates.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        word_rates = self.extractor.read_text(self.filename)
        self.word_rates = word_rates
        return word_rates

    def _word_categories(self, word_rates):
        """
        For every word in the database returns a dictionary of word->category
        according to the rates in the books.
        Returns an iterable of WordCategory for the category of every word that
        is both in the book and the database, returns the WordCategory with
        lowest category for words in the database that did not appear in the
        book.
        """
        total_words = reduce(lambda x, y: x + y, word_rates.itervalues())
        rates = {word: (float(count) / total_words)
                 for word, count in word_rates.iteritems()}
        words_not_in_book = self.manager.session.query(Word.text).all()
        words_not_in_book = set(words_not_in_book) - set(rates.keys())
        words_not_in_book = list(words_not_in_book)

        low = self.manager.session.query(Category).\
            filter(Category.description == 'low').one()
        word_count_query = self.manager.session.query(WordCategory)
        for lst in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = self.manager.session.query(Word).\
                filter(Word.text.in_(lst)).all()
            for word in words:
                rate = rates.get(word.text)
                word_count = word_count_query.filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()
                yield word_count

        for lst in list_slices(map(lambda i: i[0], words_not_in_book), MAX_SLICE_SIZE):
            word_count_data = word_count_query.filter(WordCategory.id_word.in_(lst)).\
                filter(WordCategory.id_category == low.id).all()
            for word_count in word_count_data:
                yield word_count

        
    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns an instance of WordConditionalProbability.
        """
        p = self.manager.session.query(WordConditionalProbability)
        p = p.filter_by(id_word=word_id, id_category=category_id,
            id_period=period_id)
        p = p.one()
        return p
    
    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the
        probabilities that the word and category occur in the Elizabethan and
        Romantic periods, respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period

        for wc in word_categories:
            word_id = wc.id_word
            category_id = wc.id_category
            e = self._word_conditional_probability(word_id, category_id,
                elizabethan.id).probability
            r = self._word_conditional_probability(word_id, category_id,
                romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by
        ``_word_conditional_probabilities``.
        
        Returns a tuple ``(e, r)`` of the factors that this book is Elizabethan
        or Romantic, respectively.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_probability = float(elizabethan_book_count) / total_books
        romantic_probability = float(romantic_book_count) / total_books
        elizabethan_factor =  elizabethan_probability
        romantic_factor =  romantic_probability
        # start the fallback buffers at the priors so the early return below
        # never references an undefined name
        buffer_elizabethan = elizabethan_factor
        buffer_romantic = romantic_factor
        for e, r in conditional_probabilities:
            if e != 0 and r != 0:
                # elizabethan_factor *= 10 * e * elizabethan_probability
                # romantic_factor *= 10 * r * romantic_probability
                if e < 0.1 or r < 0.1:
                    elizabethan_factor *=  100 * e
                    romantic_factor *= 100 * r
                else:
                    elizabethan_factor *=  e
                    romantic_factor *= r

                if elizabethan_factor == 0 or romantic_factor == 0 or elizabethan_factor == float('Inf') or romantic_factor == float('Inf'):
                    return buffer_elizabethan, buffer_romantic

                buffer_elizabethan = elizabethan_factor
                buffer_romantic = romantic_factor
                # logger.debug( "e = %f, r = %f" % (elizabethan_factor, romantic_factor) )
        return elizabethan_factor, romantic_factor

    def top(self, count):
        ordered = sorted(self.word_rates.iteritems(), key=lambda x: -x[1])
        return ordered[0:count]
Example #32
class Query(object):
    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``should_download`` indicates whether ``book_url`` is a URL on the
        internet rather than a local path.
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        self.run()
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        e, r = self._probabilities(wcp)
        self.elizabethan_factor = e
        self.romantic_factor = r

    def results(self):
        """
        Returns a tuple (e, r) with the factors that this book is Elizabethan
        or Romantic, respectively.
        """
        return self.elizabethan_factor, self.romantic_factor

    def clean_up(self):
        if self.should_download:
            os.remove(self.filename)

    def _word_rates(self):
        """
        Downloads the book if needed, or makes a copy of it.
        Returns a dictionary of words and their rates.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        word_rates = self.extractor.read_text(self.filename)
        self.word_rates = word_rates
        return word_rates

    def _word_categories(self, word_rates):
        """
        For every word in the database returns a dictionary of word->category
        according to the rates in the books.
        Returns an iterable of WordCategory for the category of every word that
        is both in the book and the database, returns the WordCategory with
        lowest category for words in the database that did not appear in the
        book.
        """
        total_words = reduce(lambda x, y: x + y, word_rates.itervalues())
        rates = {
            word: (float(count) / total_words)
            for word, count in word_rates.iteritems()
        }
        words_not_in_book = self.manager.session.query(Word.text).all()
        words_not_in_book = set(words_not_in_book) - set(rates.keys())
        words_not_in_book = list(words_not_in_book)

        low = self.manager.session.query(Category).\
            filter(Category.description == 'low').one()
        word_count_query = self.manager.session.query(WordCategory)
        for lst in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = self.manager.session.query(Word).\
                filter(Word.text.in_(lst)).all()
            for word in words:
                rate = rates.get(word.text)
                word_count = word_count_query.filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()
                yield word_count

        for lst in list_slices(map(lambda i: i[0], words_not_in_book),
                               MAX_SLICE_SIZE):
            word_count_data = word_count_query.filter(WordCategory.id_word.in_(lst)).\
                filter(WordCategory.id_category == low.id).all()
            for word_count in word_count_data:
                yield word_count

    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns an instance of WordConditionalProbability.
        """
        p = self.manager.session.query(WordConditionalProbability)
        p = p.filter_by(id_word=word_id,
                        id_category=category_id,
                        id_period=period_id)
        p = p.one()
        return p

    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the
        probabilities that the word and category occur in the Elizabethan and
        Romantic periods, respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period

        for wc in word_categories:
            word_id = wc.id_word
            category_id = wc.id_category
            e = self._word_conditional_probability(word_id, category_id,
                                                   elizabethan.id).probability
            r = self._word_conditional_probability(word_id, category_id,
                                                   romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by
        ``_word_conditional_probabilities``.
        
        Returns a tuple ``(e, r)`` of the factors that this book is Elizabethan
        or Romantic, respectively.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_probability = float(elizabethan_book_count) / total_books
        romantic_probability = float(romantic_book_count) / total_books
        elizabethan_factor = elizabethan_probability
        romantic_factor = romantic_probability
        # start the fallback buffers at the priors so the early return below
        # never references an undefined name
        buffer_elizabethan = elizabethan_factor
        buffer_romantic = romantic_factor
        for e, r in conditional_probabilities:
            if e != 0 and r != 0:
                # elizabethan_factor *= 10 * e * elizabethan_probability
                # romantic_factor *= 10 * r * romantic_probability
                if e < 0.1 or r < 0.1:
                    elizabethan_factor *= 100 * e
                    romantic_factor *= 100 * r
                else:
                    elizabethan_factor *= e
                    romantic_factor *= r

                if elizabethan_factor == 0 or romantic_factor == 0 or elizabethan_factor == float(
                        'Inf') or romantic_factor == float('Inf'):
                    return buffer_elizabethan, buffer_romantic

                buffer_elizabethan = elizabethan_factor
                buffer_romantic = romantic_factor
                # logger.debug( "e = %f, r = %f" % (elizabethan_factor, romantic_factor) )
        return elizabethan_factor, romantic_factor

    def top(self, count):
        ordered = sorted(self.word_rates.iteritems(), key=lambda x: -x[1])
        return ordered[0:count]
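A brief usage sketch of the Query class above, relying on its context-manager protocol (the paths and URLs are illustrative):

with Query("./texts", "sqlite:///books.db",
           "http://example.com/book.txt", should_download=True) as query:
    e, r = query.results()
    print("Elizabethan factor: %s, Romantic factor: %s" % (e, r))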
Example #33
import json

from extract import Extractor
from transform import Transformer
## Modify this boolean as needed, depending on whether you are testing directly from static JSON blobs or executing SOQL queries:
full_soql_query_mode = False
## SFDC API AUTH:
# auth is a separate Python module that acts as a placeholder for SFDC creds. The required params are referenced as follows:
# from simple_salesforce import Salesforce, SalesforceLogin
# sf = Salesforce(username='******', password='******', security_token='token', client_id='Testing', \
# instance_url='https://zayo.my.salesforce.com', session_id='')
if full_soql_query_mode == True:
    from auth import sf
    from load import Loader

    e = Extractor()
    opp_query = e.get_opp_info()
    opp_output = sf.query_all(opp_query)
    records_only = opp_output['records']
    size = opp_output['totalSize']
    formatted_opp_ids = e.format_opp_ids(opp_output, size)

    npv_task_query = e.get_npv_task_info(formatted_opp_ids)
    npv_task_output = sf.query_all(npv_task_query)

    service_order_query = e.get_so_info(formatted_opp_ids)
    service_order_output = sf.query_all(service_order_query)

    cap_proj_query = e.get_capital_project_info(formatted_opp_ids)
    cap_proj_output = sf.query_all(cap_proj_query)
Example #34
      contents = f.read()
      #print contents

      words = contents.split()
      for word in words:
        wordID = 0
        for i,d in enumerate(dictionary):
          if d[0] == word:
            wordID = i
            features_matrix[0,wordID] = words.count(word)
      
    return features_matrix

test_doc = 'travel-nontravel/tr2.txt'
doc_matrix = extract_features_for_single_doc(test_doc)
extractor = Extractor()
result3 = model1.predict(doc_matrix)
if result3==0:
	print "non travel"
else:
	print "travel"
print str(result3)+"\n"
if result3==1:
	extractor.setPath(test_doc)
	user_name = extractor.findUserName()#emailid
	date = extractor.findDate()
	time = extractor.findTime()
	address = extractor.findAddress()	
	print date
	print time
	print address
Example #35
class Trainer(object):

    def __init__(self, json_path, text_dir, db_url):
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        if not hasattr(self, "_json"):
            _json = []
            texts = {}
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            for text in texts:
                author = text["Author"]
                title = text["Title"]
                period = text["Period"]
                url = text["URL"]
                _json.append((author, title, period, url))
            self._json = _json
        return self._json

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = [f for f in listdir(self.text_dir)]
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if not filename in files:
                logger.debug("Getting %s" % filename)
                book = self.extractor.download_book(url, False, author, title, period)
            else:
                logger.debug("%s already downloaded" % filename)

    def train(self):
        logger.debug("      STARTING get_books")
        self.get_books()
        logger.debug("      STARTING populate")
        self.populate()
        logger.debug("      STARTING categories")
        self.categories()
        logger.debug("      STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()    

    def populate(self):
        output = []
        for author, title, period, url in self.json():
            # TODO clean the next line
            words = self.extractor.read_text(format_filename(author, title))
            if len(words) == 0:
                continue
            total_words = reduce(operator.add, words.values())
            #insert period
            dic_period = {'name':period}
            list_search = ['name']
            period_obj = self.manager.get_or_insert(dict_val=dic_period,
                instance=models.Period, list_search=list_search)
            #insert book
            # logger.debug(words)
            logger.debug("Total Words: %s", total_words)
            dic_book = {'name':title,
                'author':author,
                'period':period_obj,
                'total_words':total_words,
                'sentence_total':0}
            list_search = ['name','author','period']
            book_obj = self.manager.get_or_insert(dict_val=dic_book,
                instance=models.Book,list_search=list_search)
            #Words
            filename = format_filename(author, title)
            
            if len(words) == 0:
                continue

            logger.debug("Period id : %s %s" % (period_obj.id,period_obj.name))
            logger.debug("Book id : %s %s %s" % (book_obj.id,book_obj.name,book_obj.author))
            self.manager.insert_words(words,book_obj,total_words)

    def categories(self):
        words_all = self.manager.get({},Word,[],True)
        total = len(words_all)
        logger.debug("  categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 ==0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate,max_rate, word_obj)


    def period_probability(self, period, log=False):
        """
        number of books from that period
        ---
        total number of books
        """
        books_period = self.manager.session.query(Book).filter_by(period=period).count()
        if log:
            logger.debug("      books_period = %f " % (books_period))
        return books_period


    def word_category_period_probability(self, word, category, period, log=False):
        """
        counts how many books from that period have that word in that category
        ---
        number of books from that period
        """
        num_books__word_cat = 0
        books_period = self.manager.session.query(Book).filter_by(period=period).all()
        for book in books_period:
            # the book contains the word
            book_word = self.manager.session.query(WordCount).filter_by(
                book=book,word=word).all()
            word_category = self.manager.session.query(WordCategory).filter_by(
                category=category,word=word).one()
            
            # if len(book_word) == 0 there is no relation, so the probability is 0
            if len(book_word) > 0 and word_category:
                if book_word[0].rate >= word_category.min_range and book_word[0].rate < word_category.max_range:
                    num_books__word_cat += 1
        if log:
            logger.debug("      num_books__word_cat= %f" % (num_books__word_cat))

        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        probability of that word in that category in that period
        ---
        probability of that period = number of books from that period / total number of books
        """
        word_category_period_probability = self.word_category_period_probability(word, category, period, log=log)
        period_probability = self.period_probability(period, log=log)
        if log:
            logger.debug("  word cat period prob = %f / period prob = %f = %f" % (word_category_period_probability,period_probability,word_category_period_probability/period_probability))
        return word_category_period_probability/period_probability


    def conditional_probability(self):
        """
        """
        self.manager.session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        categories = self.manager.session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    #word rate?
                    prob = self.probability(
                        word=word,
                        category=category,
                        period=period)
                    if prob > 1:
                        logger.debug("word %s category %s  period %s prob %s" % (word.text,category.description, period.name, prob))
                        self.probability(word=word,category=category,period=period, log=True)
                    word_cond_prob = WordConditionalProbability(
                        word=word,
                        category=category,
                        period=period,
                        probability=prob)
                    bulk.append(word_cond_prob)
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        self.manager.session.add_all(bulk)
        self.manager.session.commit()
        self.complete_probability()

    def complete_probability(self):
        bulk = []
        list_cat = ['med','high','high_high']
        cats_ids = self.manager.session.query(Category).filter(Category.description.in_(list_cat)).all()
        low = self.manager.session.query(Category).filter(Category.description=='low').one()

        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = self.manager.session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(WordConditionalProbability.id_category.in_(c.id for c in cats_ids),
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period == period.id)
                    ).all()[0][0]
                cat_low = self.manager.session.query(WordConditionalProbability).filter(
                        and_(WordConditionalProbability.id_category == low.id,
                            WordConditionalProbability.id_word == word.id,
                            WordConditionalProbability.id_period == period.id)
                    ).all()
                cat_low[0].probability = 1 - sum_3cat
                # print "word_id %s period %d sum %s" %(word.id,period.id,sum_3cat)
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        self.manager.session.commit()
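A minimal driver sketch for the Trainer class above (the JSON path, text directory, and database URL are illustrative):

trainer = Trainer("books.json", "./texts", "sqlite:///books.db")
trainer.train()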
Example #36
def main():
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input", help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument(
        "-o",
        "--output",
        default="text",
        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument(
        "-b",
        "--bytes",
        default="1M",
        help="maximum bytes per output file (default %(default)s)",
        metavar="n[KMG]")
    groupO.add_argument("-c",
                        "--compress",
                        action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument(
        "--json",
        action="store_true",
        help="write output in json format instead of the default <doc> format")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html",
                        action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l",
                        "--links",
                        action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns",
                        "--namespaces",
                        default="",
                        metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates",
                        action="store_false",
                        help="Do not expand templates")
    groupP.add_argument(
        "--html-safe",
        default=True,
        help="use to produce HTML safe output within <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument(
        "--processes",
        type=int,
        default=default_process_count,
        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q",
                        "--quiet",
                        action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug",
                        action="store_true",
                        help="print debug info")
    groupS.add_argument(
        "-a",
        "--article",
        action="store_true",
        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v",
                        "--version",
                        action="version",
                        version='%(prog)s ' + __version__,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.HtmlFormatting = args.html
    if args.html:
        Extractor.keepLinks = True
    Extractor.to_json = args.json

    expand_templates = args.no_templates

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024**power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        with open(input_file) as file:
            page = file.read()
            ids = re.findall(r'<id>(\d*?)</id>', page)
            id = ids[0] if ids else ''
            revid = ids[1] if len(ids) > 1 else ''
            m = re.search(r'<title>(.*?)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            m = re.search(r'<base>(.*?)</base>', page)
            if m:
                base = m.group(1)
                urlbase = base[:base.rfind("/")]
            else:
                urlbase = ''
            Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes, args.html_safe)
Example #37
def main(argv):
    caffe_root = 'caffe'
    mypycaffe_dir = os.path.join(caffe_root, 'python')

    parser = argparse.ArgumentParser()
    # Required arguments: input file path;
    parser.add_argument(
        "input_folder",
        help="HICO image folder containing 'train2015' and 'test2015'."
    )
    parser.add_argument(
        "output_folder",
        help="Folder to save output features."
    )
    parser.add_argument(
        "num_batch",
        type=int,
        help="Number of batches."
    )
    parser.add_argument(
        "batch_id",
        type=int,
        help="Batch index."
    )
    # Optional arguments.
    parser.add_argument(
        "--chunk_size",
        default=10,
        type=int,
        help="Number of images to work on at one time."
    )
    parser.add_argument(
        "--model_def",
        default=os.path.join(mypycaffe_dir,
                "../models/bvlc_reference_caffenet/deploy.prototxt"),
        help="Model definition file."
    )
    parser.add_argument(
        "--pretrained_model",
        default=os.path.join(mypycaffe_dir,
                "../models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel"),
        help="Trained model weights file."
    )
    parser.add_argument(
        "--gpu",
        action='store_true',
        help="Switch for gpu computation."
    )
    # parser.add_argument(
    #     "--center_only",
    #     action='store_true',
    #     help="Switch for prediction from center crop alone instead of " +
    #          "averaging predictions across crops (default)."
    # )
    parser.add_argument(
        "--images_dim",
        default='256,256',
        help="Canonical 'height,width' dimensions of input images."
    )
    parser.add_argument(
        "--mean_file",
        default=os.path.join(mypycaffe_dir,
                             'caffe/imagenet/ilsvrc_2012_mean.npy'),
        help="Data set image mean of H x W x K dimensions (numpy array). " +
             "Set to '' for no mean subtraction."
    )
    parser.add_argument(
        "--input_scale",
        type=float,
        help="Multiply input features by this scale to finish preprocessing."
    )
    parser.add_argument(
        "--raw_scale",
        type=float,
        default=255.0,
        help="Multiply raw input by this scale before preprocessing."
    )
    parser.add_argument(
        "--channel_swap",
        default='2,1,0',
        help="Order to permute input channels. The default converts " +
             "RGB -> BGR since BGR is the Caffe default by way of OpenCV."
    )
    parser.add_argument(
        "--ext",
        default='jpg',
        help="Image file extension to take as input when a directory " +
             "is given as the input file."
    )
    # new arguments
    parser.add_argument(
        "--crop_mode",
        default='oversample',
        help="Set the mode for cropping input images."
    )

    args = parser.parse_args()

    image_dims = [int(s) for s in args.images_dim.split(',')]

    mean, channel_swap = None, None
    if args.mean_file:
        # mean = np.load(args.mean_file).mean(1).mean(1)
        if len(args.mean_file) > 8 and args.mean_file[:8] == 'setmean-':
            if args.mean_file[8:] == 'VGG16':
                mean = np.array([102.9801, 115.9465, 122.7717])
            # Add more cases here.
        else:
            mean = np.load(args.mean_file).mean(1).mean(1)
    if args.channel_swap:
        channel_swap = [int(s) for s in args.channel_swap.split(',')]

    net = Extractor(args.model_def, args.pretrained_model,
                    image_dims=image_dims, mean=mean,
                    input_scale=args.input_scale, 
                    raw_scale=args.raw_scale,
                    channel_swap=channel_swap,
                    feature_name="fc7")

    if args.gpu:
        caffe.set_mode_gpu()
        print("GPU mode")
    else:
        caffe.set_mode_cpu()
        print("CPU mode")

    # get list_src and list_des
    list_src, list_des = get_process_file( \
        args.input_folder, args.output_folder, args.num_batch, args.batch_id)

    # get chunk size
    chunk_size = args.chunk_size 

    # start extract
    cnt = 0
    current = 0
    chunk_src = []
    chunk_des = []
    for src, des in zip(list_src, list_des):
        # skip if output file exists
        try:
            garbage = io.loadmat(des)
            cnt += 1
            print '{:05d}/{:05d} {}'.format(cnt,len(list_src), \
                                            os.path.basename(src))
            continue
        except:
            # print dest
            pass

        # start batch
        if current == 0:
            print 'start chunk'

        # update cnt and current
        cnt += 1
        current += 1
        chunk_src.append(src)
        chunk_des.append(des)
        print '{:05d}/{:05d} {}'.format(cnt,len(list_src), \
                                        os.path.basename(src))

        # process batch
        if current == chunk_size or cnt == len(list_src):
            # load image
            try:
                inputs = [caffe.io.load_image(img_f) for img_f in chunk_src]
            except IOError as e:
                print "I/O error: " + str(e)
                current = 0
                chunk_src = []
                chunk_des = []
                continue
            except ValueError as e:
                print "value error: " + str(e)
                current = 0
                chunk_src = []
                chunk_des = []
                continue

            # extract feature
            # features = net.extract(inputs)
            features = net.extract(inputs, args.crop_mode)

            # save feature
            for index, feature in enumerate(features):
                io.savemat(chunk_des[index], {'feat': feature})

            # reset
            current = 0
            chunk_src = []
            chunk_des = []

            print "chunk done: processed {} images.".format(cnt)

    print 'done.'