def merge():
    fid = open(CookieExtractor.raw_cache_name, 'r')
    cookie_tuple = Extractor.read_line(fid)
    cookie_info_dict = dict()
    while cookie_tuple is not None:
        # count every cookie seen in this IP's clicks and how often each appears
        for ip in cookie_tuple:
            cookie = cookie_tuple[ip][CookieExtractor.cookie_index]
            if ip in cookie_info_dict:
                cookie_info = cookie_info_dict[ip]
                cookie_num = cookie_info[CookieExtractor.cookie_num_index]
                if cookie in cookie_num:
                    cookie_num[cookie] += 1
                else:
                    cookie_num[cookie] = 1
            else:
                cookie_info = list()
                cookie_num = dict()
                cookie_info.append(cookie_num)
                cookie_num[cookie] = 1
                cookie_info_dict[ip] = cookie_info
        # next line
        cookie_tuple = Extractor.read_line(fid)
    fid = open(CookieExtractor.cache_name, 'w')
    fid.writelines(json.dumps(cookie_info_dict))
def merge():
    fid = open(TagExtractor.raw_cache_name, 'r')
    tag_tuple = Extractor.read_line(fid)
    tag_info_dict = dict()
    while tag_tuple is not None:
        for ip in tag_tuple:
            tag = tag_tuple[ip][TagExtractor.tag_index]
            if ip in tag_info_dict:
                tag_info = tag_info_dict[ip]
                tag_num_info = tag_info[TagExtractor.tag_num_info_index]
                if tag in tag_num_info:
                    tag_num_info[tag] += 1
                else:
                    tag_num_info[tag] = 1
            else:
                tag_info = list()
                tag_num_info = dict()
                tag_info.append(tag_num_info)
                tag_num_info[tag] = 1
                tag_info_dict[ip] = tag_info
        # next line
        tag_tuple = Extractor.read_line(fid)
    fid = open(TagExtractor.cache_name, 'w')
    fid.writelines(json.dumps(tag_info_dict))
def clean_markup(markup, keep_links=False, ignore_headers=True):
    """
    Clean Wikimarkup to produce plaintext.

    :param keep_links: Set to True to keep internal and external links
    :param ignore_headers: if set to True, the output list will not contain headers

    Returns a list of paragraphs (unicode strings).
    """
    keyword_filter = [
        '<span style="background',
        'Chào mừng bạn',
        'Xin chào bạn',
        'Mời bạn tham khảo',
        'Để chứng tỏ được độ nổi bật của đề tài',
        'Bạn cũng có thể đóng góp',
        'Khi thảo luận, bạn',
        'Còn thắc mắc? Ghé',
        '## ',
        ' Hoan nghênh tham gia Wikipedia',
        'Wiki chính thức',
        'Tính năng:',
        ' <font color',
        'Tiêu chuẩn bài viết'
    ]

    if not keep_links:
        ignoreTag('a')

    extractor = Extractor(0, '', [])

    # returns a list of strings
    paragraphs = extractor.clean_text(markup,
                                      mark_headers=True,
                                      expand_templates=False,
                                      escape_doc=True)
    resetIgnoredTags()

    if ignore_headers:
        for k in keyword_filter:
            paragraphs = list(filter(lambda s: not s.startswith(k), paragraphs))

    return paragraphs
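# Illustrative call to clean_markup above (a minimal sketch; the sample markup
# string is an assumption, not taken from the original source).
sample_markup = "== Heading ==\nThis is '''wiki''' text with a [[link]]."
for paragraph in clean_markup(sample_markup, keep_links=False, ignore_headers=True):
    print(paragraph)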
def extract(line):
    model = RawData(line)
    key = model.ip
    fields = list()
    fields.append(model.placement_id)
    Extractor.solid(key, fields, PlacementExtractor.fid)
def merge():
    fid = open(DevExtractor.raw_cache_name, 'r')
    dev_tuple = Extractor.read_line(fid)
    dev_info_dict = dict()
    while dev_tuple is not None:
        for ip in dev_tuple:
            dev = dev_tuple[ip][DevExtractor.dev_id_index]
            if ip in dev_info_dict:
                dev_info = dev_info_dict[ip]
                dev_num_info = dev_info[DevExtractor.dev_num_info_index]
                if dev in dev_num_info:
                    dev_num_info[dev] += 1
                else:
                    dev_num_info[dev] = 1
            else:
                dev_info = list()
                dev_num_info = dict()
                dev_info.append(dev_num_info)
                dev_num_info[dev] = 1
                dev_info_dict[ip] = dev_info
        # next line
        dev_tuple = Extractor.read_line(fid)
    fid = open(DevExtractor.cache_name, 'w')
    fid.writelines(json.dumps(dev_info_dict))
def extract(line):
    model = RawData(line)
    key = model.ip
    fields = list()
    fields.append(model.user_agent)
    Extractor.solid(key, fields, UAExtractor.fid)
def __init__(self, json_path, text_dir, db_url):
    self.json_path = json_path
    self.text_dir = text_dir
    self.db_url = db_url
    if not isdir(self.text_dir):
        mkdir(self.text_dir)
    self.extractor = Extractor(text_dir)
    self.manager = Manager(db_url)
def extract(line):
    model = RawData(line)
    key = model.ip
    fields = list()
    # cookie
    fields.append(model.cookie)
    Extractor.solid(key, fields, CookieExtractor.fid)
def extract(line):
    model = RawData(line)
    key = model.ip
    # SECOND_TAG
    fields = list()
    fields.append(Context.media_info_table[key][2])
    Extractor.solid(key, fields, TagExtractor.fid)
def __init__(self):
    self.__data = Extractor().extract_csv_data()
    self.__loader = Loader()

    # save all extracted DataFrames from csv files to parquet files
    for k, v in self.__data.items():
        self.__loader.save_to_parquet(k, v)

    # reads all saved parquet files
    data_files = self.__loader.read_parquets("weather")

    # combines all DataFrames into one to get the highest temp from all records
    self.__df = pd.concat(data_files, ignore_index=True)
def extract(line):
    model = RawData(line)
    key = model.ip
    # DEVICE ID
    fields = list()
    fields.append(model.mobile_imei + model.mobile_idfa +
                  model.mobile_android_id + model.mobile_mac)
    Extractor.solid(key, fields, DevExtractor.fid)
def __init__(self, text_dir, db_url, book_url, should_download=False):
    """
    ``text_dir`` is the directory where a copy of the text should be put.
    ``db_url`` should be the url to a database that already exists.
    ``should_download`` indicates whether ``book_url`` is a url on the
    internet or a local path.
    """
    self.text_dir = text_dir
    self.db_url = db_url
    self.book_url = book_url
    self.should_download = should_download
    self.manager = Manager(db_url)
    self.extractor = Extractor(text_dir)
def classify(k, classifier_path, image):
    im = Image.open(image)
    e = Extractor(im)
    start = datetime.datetime.now()
    print '#' * 37 + ' DATA ' + '#' * 37
    for s_im in e:
        if (Extractor.is_whitespace(s_im)):
            print s_im,
        else:
            classify_digit(k, classifier_path, s_im)
    print '#' * 80
    end = datetime.datetime.now()
    print '### TIME: {} sec ###'.format((end - start).total_seconds())
class Transformer:
    def __init__(self):
        self.__data = Extractor().extract_csv_data()
        self.__loader = Loader()

        # save all extracted DataFrames from csv files to parquet files
        for k, v in self.__data.items():
            self.__loader.save_to_parquet(k, v)

        # reads all saved parquet files
        data_files = self.__loader.read_parquets("weather")

        # combines all DataFrames into one to get the highest temp from all records
        self.__df = pd.concat(data_files, ignore_index=True)

    def find_hottest_day(self):
        """
        Gets a subset of the combined data containing only the columns we need,
        then finds the row matching the maximum ScreenTemperature and returns it.
        :return: a DataFrame row containing the result of the query.
        """
        # creates a subset of the data with only the columns we need
        df_subset = self.__df[[
            'ObservationDate', 'ScreenTemperature', 'Region'
        ]]

        # find the row with max temperature
        return df_subset[df_subset['ScreenTemperature'] ==
                         df_subset['ScreenTemperature'].max()]
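# Minimal usage sketch for the Transformer above (assumes the Extractor/Loader
# setup and the "weather" parquet files referenced in __init__ are in place).
if __name__ == "__main__":
    transformer = Transformer()
    hottest = transformer.find_hottest_day()
    print(hottest)  # row(s) with ObservationDate, ScreenTemperature and Region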
def merge():
    fid = open(PlacementExtractor.raw_cache_name, 'r')
    placement_tuple = Extractor.read_line(fid)
    placement_info_dict = dict()
    while placement_tuple is not None:
        for ip in placement_tuple:
            # the placement id is the only field written by extract()
            placement = placement_tuple[ip][0]
            if ip in placement_info_dict:
                placement_info = placement_info_dict[ip]
                placement_num = placement_info[PlacementExtractor.placement_num_index]
                if placement in placement_num:
                    placement_num[placement] += 1
                else:
                    placement_num[placement] = 1
            else:
                placement_info = list()
                placement_num = dict()
                placement_info.append(placement_num)
                placement_num[placement] = 1
                placement_info_dict[ip] = placement_info
        # next line
        placement_tuple = Extractor.read_line(fid)
    fid = open(PlacementExtractor.cache_name, 'w')
    fid.writelines(json.dumps(placement_info_dict))
async def extract(request):
    """
    Returns the text contained in the image.
    """
    body = request.files['file'][0].body
    text = Extractor.extract(body)
    return response.text(text)
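# Hypothetical wiring for the handler above, assuming it is served with Sanic
# (consistent with the request.files / response.text usage); the app name and
# route are illustrative, not from the original source.
from sanic import Sanic

app = Sanic("extractor_service")
app.add_route(extract, "/extract", methods=["POST"])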
class TestExtractor(TestCase):
    def setUp(self):
        self.extractor = Extractor()

    def test_extract_csv_data(self):
        result = self.extractor.extract_csv_data()
        self.assertCountEqual(
            ['weather.20160201.csv', 'weather.20160301.csv'], result.keys())
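# Optional entry point so the test case above can be run directly as a script;
# standard unittest discovery (python -m unittest) picks it up as well.
if __name__ == "__main__":
    import unittest
    unittest.main()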
def main(inputdirs, opts=None, **extraOpts):
    information = None
    if not opts and extraOpts:
        # extraOpts is only here for convenience if you want to specify options
        # as keyword arguments. It requires that you haven't specified opts.
        opts = extraOpts

    if opts and len(inputdirs) > 0:
        extractor = Extractor(
            inputdirs
            , infoKls = Everything
            , tempDir = opts.get("tempDir")
            , stateKls = State
            , parserKls = Parser
            , extension = opts.get("extension", "js")
            )

        information = extractor.generate()
        information[PROJECT] = opts.get("project", None)
        information[VERSION] = opts.get("version", None)
        information[PROJECT_URL] = opts.get("projectUrl", None)
        information[COPYRIGHT_TAG] = opts.get("copyrightTag", None)

        templatedirs = opts.get("templateDirs", [])
        defaultTemplates = os.path.join(here, "templates")
        if defaultTemplates not in templatedirs:
            templatedirs.insert(0, defaultTemplates)

        gen = Generator(
            tempdir = opts.get("tempDir")
            , outDir = opts.get("outDir")
            , assetDirs = opts.get("assetDirs", None)
            , showPrivate = opts.get("showPrivate", True)
            , templateDirs = templatedirs
            )

        gen.process(information)
    else:
        optparser.error("Incorrect number of arguments")

    return information
def train(classifier_path, image_path):
    """Train the classifier with an image.

    This method extracts digits from the image, then prompts the user to
    classify each one. That classification is stored in the following format:

        Digit: [classification] File: [image taken from]
        [PIXEL DATA]

    Args:
        classifier_path (String): path to the file holding the classifier data
        image_path (String): path to the image to train on
    """
    # open the classifier file
    with open(classifier_path, "a+") as f:
        # open the image
        im_name = os.path.basename(image_path)
        im = Image.open(image_path)
        # create an Extractor
        e = Extractor(im)
        # iterate over the SimpleImages extracted
        for s_im in e:
            # check that it isn't whitespace
            if (not Extractor.is_whitespace(s_im)):
                # print the image and have the user classify it
                print s_im
                digit = input("Input value ('s' to skip): ")
                # skip this digit
                if (digit == 's'):
                    continue
                # write the data to the file
                f.write(CLASSIFIER_FORMAT.format(digit, im_name, str(s_im)))
def update(self, rsession):
    url = self.config['url']
    page_limit = int(self.config['page_limit'])
    page_count = 1
    posting_urls = []
    e = Extractor(self.config['fields'])

    while None != url:
        # download listing
        r = rsession.get(url)
        for c in r.cookies:
            rsession.cookies.set_cookie(c)
        clean = UnicodeDammit(r.content)
        content = clean.unicode_markup.encode('utf-8')
        print 'DL: {0}: {1} [{2}], {3} bytes'.format(
            url, r.status_code, clean.original_encoding, len(content))

        # pull information from html
        result = e.extract(content)

        # get url of next page
        url = result['next_page_url']
        if None == url:
            break
        # make sure the url is absolute
        if 'http' not in url:
            url = self.config['base_url'] + url

        # get all new posting urls
        new_urls = 0
        for posting_url in result['posting_url']:
            if 'http' not in posting_url:
                posting_url = self.config['base_url'] + posting_url
            # save any previously unseen urls
            if not Url.query.filter(Url.value == posting_url).count() > 0:
                new_urls += 1
                posting_urls.append(Url(value=posting_url, status='listed'))
                session.commit()

        page_count += 1
        # break if page limit reached, or page full of known urls
        if 0 == new_urls or page_count > page_limit:
            break

    print '{0} new urls found'.format(new_urls)
def extract_process(jobs_queue, output_queue):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text

    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
    """
    while True:
        job = jobs_queue.get()  # job is (id, title, page, ordinal)
        if job:
            out = BytesIO()  # memory buffer
            Extractor(*job[:3]).extract(out)  # (id, title, page)
            text = out.getvalue()
            output_queue.put((job[3], text))  # (ordinal, extracted_text)
            out.close()
        else:
            break
def extract_process(jobs_queue, output_queue, html_safe):
    """Pull tuples of raw page content, do CPU/regex-heavy fixup, push finished text

    :param jobs_queue: where to get jobs.
    :param output_queue: where to queue extracted text for output.
    :param html_safe: whether to convert entities in text to HTML.
    """
    while True:
        job = jobs_queue.get()  # job is (id, revid, urlbase, title, page, ordinal)
        if job:
            out = StringIO()  # memory buffer
            Extractor(*job[:-1]).extract(out, html_safe)  # (id, revid, urlbase, title, page)
            text = out.getvalue()
            output_queue.put((job[-1], text))  # (ordinal, extracted_text)
            out.close()
        else:
            break
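# A minimal wiring sketch for extract_process above (an assumption, not part of
# the original source): feed job tuples on one queue, send a falsy sentinel to
# stop the worker, and collect (ordinal, extracted_text) pairs from the other.
from multiprocessing import Process, Queue

def run_extraction(pages, html_safe=True):
    jobs_queue, output_queue = Queue(), Queue()
    worker = Process(target=extract_process,
                     args=(jobs_queue, output_queue, html_safe))
    worker.start()
    for ordinal, (id, revid, urlbase, title, page) in enumerate(pages):
        jobs_queue.put((id, revid, urlbase, title, page, ordinal))
    jobs_queue.put(None)  # sentinel: makes the worker's loop break
    results = [output_queue.get() for _ in pages]
    worker.join()
    return results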
def main():
    base = 'https://www.owler.com'
    # path = '/sector/industrial-machinery-equipment-companies'
    # path = '/industry/industrial-goods-services-companies'
    path = '/industry/industrial-goods-services-companies?p=1319'
    url = base + path

    driver = webdriver.Firefox()
    time.sleep(7)
    wait = WebDriverWait(driver, 20)
    driver.get(url)
    time.sleep(10)

    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()  # Mocked
    resultsInfo = Extractor(driver.page_source)
    sdf = resultsInfo.getData()
    # writeData(sdf, 'biggertest')  # Mocked
    writeData(sdf, 'companies')
    n = resultsInfo.nResults()
    print(n, 'this is main N')

    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Crawl will begin' % (i))

    # for v in range(2, (int(n/15)+1)):
    for v in range(1320, (int(n / 15) + 1)):
        randomPause = random.randint(8, 13)
        for i in range(randomPause, 0, -1):
            time.sleep(1)
            # print('%s seconds - Next page will begin' % (i))
        wait = WebDriverWait(driver, 20)
        wait.until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="next-15"]')))
        driver.find_element_by_xpath('//*[@id="next-15"]').click()
        html = driver.page_source
        info = Extractor(html)
        df = info.getData()
        # writeData(df, 'biggertest')  # Mocked
        writeData(df, 'companies')
        print('Page %s of %s' % (v, int(n / 15)))
        if info.title() == 'Pardon Our Interruption':
            print('wait: %s, p: %s of %s' % (randomPause, v, str(int(n / 15) + 1)))
            print(datetime.datetime.now())
            driver.quit()
            raise SystemExit('They\'re onto us! Ghost out!')

    driver.quit()
def main():
    if 'linux' in sys.platform:
        # start xvfb in case no X is running. Make sure xvfb
        # is installed, otherwise this won't work!
        dryscrape.start_xvfb()

    # sPage = requests.get(startUrl)
    # sHtml = sPage.text
    # sPage.raise_for_status()
    sess = dryscrape.Session(base_url='https://www.owler.com')
    sess.set_attribute('auto_load_images', False)
    sess.visit('/sector/industrial-machinery-equipment-companies')
    print(sess.status_code(), sess.headers())
    sHtml = sess.body()

    # with open('sample.txt', 'r') as f:  # Mocked
    #     sHtml = f.read()  # Mocked
    resultsInfo = Extractor(sHtml)
    sdf = resultsInfo.getData()
    print(type(sdf))
    # writeData(sdf, 'companies')
    writeData(sdf, 'runcompanies')  # Mocked
    n = resultsInfo.nResults()

    for i in range(5, 0, -1):
        time.sleep(1)
        print('%s seconds - Next page will begin' % (i))

    for v in range(2, int(n / 15)):
        nextone = '/sector/industrial-machinery-equipment-companies?p=%s' % (v)
        print(nextone)
        # page = requests.get(nextpage)
        # page.raise_for_status()
        # html = page.text
        sess.visit(nextone)
        print(sess.status_code(), sess.headers())
        html = sess.body()
        info = Extractor(html)
        # info = Extractor(sHtml)  # Mocked
        df = info.getData()
        # writeData(df, 'companies')
        writeData(df, 'runcompanies')  # Mocked
        for i in range(20, 0, -1):
            time.sleep(1)
            print('%s seconds - Next page will begin' % (i))
def main():
    args = parser.parse_args()

    extractor = Extractor()
    extractor.top = args.top
    extractor.bottom = args.bottom
    extractor.left = args.left
    extractor.right = args.right

    file_name = args.video_file
    video = cv2.VideoCapture(file_name)
    fps = video.get(cv2.CAP_PROP_FPS)
    length = video.get(cv2.CAP_PROP_FRAME_COUNT)
    interval_sec = INTERVAL / fps

    success = True
    index = 0
    characters = 0
    last_text = ''
    last_sec = 0
    while success:
        success, frame = video.read()
        total_sec = index / fps
        if index % INTERVAL == 0 and total_sec >= last_sec:
            text = extractor.extract(frame)
            characters += len(text)
            if len(text) == 0:
                dur = total_sec - last_sec
                output_subtitle(last_sec, last_text, dur)
            else:
                if similarity(text, last_text) > 0.6:
                    dur = interval_sec / 2
                else:
                    dur = len(text) * TIME_PER_CHAR
                output_subtitle(last_sec, text, dur)
            msg("%d%% Processed, %d Characters Scanned" % (floor(
                (index / length) * 100), characters))
            last_sec = last_sec + dur
            last_text = text
        index += 1
def merge():
    fid = open(UAExtractor.raw_cache_name, 'r')
    ua_tuple = Extractor.read_line(fid)
    ua_info_dict = dict()
    while ua_tuple is not None:
        for ip in ua_tuple:
            # the user agent is the only field written by extract()
            ua = ua_tuple[ip][0]
            if ip in ua_info_dict:
                ua_info = ua_info_dict[ip]
                ua_num = ua_info[UAExtractor.ua_num_index]
                if ua in ua_num:
                    ua_num[ua] += 1
                else:
                    ua_num[ua] = 1
            else:
                ua_info = list()
                ua_num = dict()
                ua_info.append(ua_num)
                ua_num[ua] = 1
                ua_info_dict[ip] = ua_info
        # next line
        ua_tuple = Extractor.read_line(fid)
    fid = open(UAExtractor.cache_name, 'w')
    fid.writelines(json.dumps(ua_info_dict))
def get_dev_num_info(ip):
    dev_info_dict = Extractor.read_cache(DevExtractor.cache_name)
    dev_num_info = dev_info_dict[ip][DevExtractor.dev_num_info_index]
    return dev_num_info
class Query(object):
    def __init__(self, text_dir, db_url, book_url, should_download=False):
        """
        ``text_dir`` is the directory where a copy of the text should be put.
        ``db_url`` should be the url to a database that already exists.
        ``should_download`` indicates whether ``book_url`` is a url on the
        internet or a local path.
        """
        self.text_dir = text_dir
        self.db_url = db_url
        self.book_url = book_url
        self.should_download = should_download
        self.manager = Manager(db_url)
        self.extractor = Extractor(text_dir)

    def __enter__(self):
        self.run()
        return self

    def __exit__(self, type, value, traceback):
        self.clean_up()

    def run(self):
        word_rates = self._word_rates()
        word_categories = self._word_categories(word_rates)
        wcp = self._word_conditional_probabilities(word_categories)
        e, r = self._probabilities(wcp)
        self.elizabethan_factor = e
        self.romantic_factor = r

    def results(self):
        """
        Returns a tuple (e, r) with the factor that this book is Elizabethan
        or Romantic respectively.
        """
        return self.elizabethan_factor, self.romantic_factor

    def clean_up(self):
        if self.should_download:
            os.remove(self.filename)

    def _word_rates(self):
        """
        Downloads the book if needed, or makes a copy of it.
        Returns a dictionary of words and their rates.
        """
        if self.should_download:
            self.filename = self.extractor.download_book(self.book_url, True)
        else:
            self.filename = self.book_url
        word_rates = self.extractor.read_text(self.filename)
        self.word_rates = word_rates
        return word_rates

    def _word_categories(self, word_rates):
        """
        For every word in the database returns a dictionary of word->category
        according to the rates in the books.
        Returns an iterable of WordCategory for the category of every word that
        is both in the book and the database; returns the WordCategory with the
        lowest category for words in the database that did not appear in the book.
        """
        total_words = reduce(lambda x, y: x + y, word_rates.itervalues())
        rates = {word: (float(count) / total_words)
                 for word, count in word_rates.iteritems()}
        words_not_in_book = self.manager.session.query(Word.text).all()
        words_not_in_book = set(words_not_in_book) - set(rates.keys())
        words_not_in_book = list(words_not_in_book)
        low = self.manager.session.query(Category).\
            filter(Category.description == 'low').one()
        word_count_query = self.manager.session.query(WordCategory)
        for lst in dict_key_slice(rates, MAX_SLICE_SIZE):
            words = self.manager.session.query(Word).\
                filter(Word.text.in_(lst)).all()
            for word in words:
                rate = rates.get(word.text)
                word_count = word_count_query.filter(WordCategory.id_word == word.id).\
                    filter(WordCategory.min_range <= rate).\
                    filter(WordCategory.max_range > rate).one()
                yield word_count
        for lst in list_slices(map(lambda i: i[0], words_not_in_book),
                               MAX_SLICE_SIZE):
            word_count_data = word_count_query.filter(WordCategory.id_word.in_(lst)).\
                filter(WordCategory.id_category == low.id).all()
            for word_count in word_count_data:
                yield word_count

    def _word_conditional_probability(self, word_id, category_id, period_id):
        """
        Returns an instance of WordConditionalProbability.
        """
        p = self.manager.session.query(WordConditionalProbability)
        p = p.filter_by(id_word=word_id,
                        id_category=category_id,
                        id_period=period_id)
        p = p.one()
        return p

    def _word_conditional_probabilities(self, word_categories):
        """
        Receives an iterable of WordCategory objects.
        Yields tuples of ``(e, r)`` where ``e`` and ``r`` are the probabilities
        that the word and category be in the Elizabethan and Romantic periods
        respectively.
        """
        elizabethan = self.manager.elizabethan_period
        romantic = self.manager.romantic_period
        for wc in word_categories:
            word_id = wc.id_word
            category_id = wc.id_category
            e = self._word_conditional_probability(word_id, category_id,
                                                   elizabethan.id).probability
            r = self._word_conditional_probability(word_id, category_id,
                                                   romantic.id).probability
            yield e, r

    def _probabilities(self, conditional_probabilities):
        """
        Receives an iterable as returned by ``_word_conditional_probabilities``.
        Returns a tuple ``(e, r)`` of the factor that this book is Elizabethan
        or Romantic respectively.
        """
        elizabethan_book_count = self.manager.elizabethan_book_count
        romantic_book_count = self.manager.romantic_book_count
        total_books = elizabethan_book_count + romantic_book_count
        elizabethan_probability = float(elizabethan_book_count) / total_books
        romantic_probability = float(romantic_book_count) / total_books
        elizabethan_factor = elizabethan_probability
        romantic_factor = romantic_probability
        buffer_elizabethan = elizabethan_factor
        buffer_romantic = romantic_factor
        for e, r in conditional_probabilities:
            if e != 0 and r != 0:
                # elizabethan_factor *= 10 * e * elizabethan_probability
                # romantic_factor *= 10 * r * romantic_probability
                if e < 0.1 or r < 0.1:
                    elizabethan_factor *= 100 * e
                    romantic_factor *= 100 * r
                else:
                    elizabethan_factor *= e
                    romantic_factor *= r
            if (elizabethan_factor == 0 or romantic_factor == 0 or
                    elizabethan_factor == float('Inf') or
                    romantic_factor == float('Inf')):
                # return the last finite, non-zero factors seen
                return buffer_elizabethan, buffer_romantic
            buffer_elizabethan = elizabethan_factor
            buffer_romantic = romantic_factor
        # logger.debug("e = %f, r = %f" % (elizabethan_factor, romantic_factor))
        return elizabethan_factor, romantic_factor

    def top(self, count):
        ordered = sorted(self.word_rates.iteritems(), key=lambda x: -x[1])
        return ordered[0:count]
import json

from extract import Extractor
from transform import Transformer

## Modify this boolean as needed: leave it False when testing directly from
## static JSON blobs, set it True to execute the SOQL queries below:
full_soql_query_mode = False

## SFDC API AUTH:
# auth is a separate Python module used as a placeholder to store SFDC creds.
# Reference the required params as follows:
# from simple_salesforce import Salesforce, SalesforceLogin
# sf = Salesforce(username='******', password='******', security_token='token', client_id='Testing', \
#                 instance_url='https://zayo.my.salesforce.com', session_id='')

if full_soql_query_mode:
    from auth import sf
    from load import Loader

    e = Extractor()
    opp_query = e.get_opp_info()
    opp_output = sf.query_all(opp_query)
    records_only = opp_output['records']
    size = opp_output['totalSize']
    formatted_opp_ids = e.format_opp_ids(opp_output, size)

    npv_task_query = e.get_npv_task_info(formatted_opp_ids)
    npv_task_output = sf.query_all(npv_task_query)

    service_order_query = e.get_so_info(formatted_opp_ids)
    service_order_output = sf.query_all(service_order_query)

    cap_proj_query = e.get_capital_project_info(formatted_opp_ids)
    cap_proj_output = sf.query_all(cap_proj_query)
contents = f.read()
# print contents
words = contents.split()
for word in words:
    wordID = 0
    for i, d in enumerate(dictionary):
        if d[0] == word:
            wordID = i
            features_matrix[0, wordID] = words.count(word)
return features_matrix


test_doc = 'travel-nontravel/tr2.txt'
doc_matrix = extract_features_for_single_doc(test_doc)
extractor = Extractor()

result3 = model1.predict(doc_matrix)
if result3 == 0:
    print "non travel"
else:
    print "travel"
print str(result3) + "\n"

if result3 == 1:
    extractor.setPath(test_doc)
    user_name = extractor.findUserName()  # emailid
    date = extractor.findDate()
    time = extractor.findTime()
    address = extractor.findAddress()
    print date
    print time
    print address
class Trainer(object):
    def __init__(self, json_path, text_dir, db_url):
        self.json_path = json_path
        self.text_dir = text_dir
        self.db_url = db_url
        if not isdir(self.text_dir):
            mkdir(self.text_dir)
        self.extractor = Extractor(text_dir)
        self.manager = Manager(db_url)

    def json(self):
        if not hasattr(self, "_json"):
            _json = []
            texts = {}
            with open(self.json_path, "r") as f:
                texts = json.load(f)
            for text in texts:
                author = text["Author"]
                title = text["Title"]
                period = text["Period"]
                url = text["URL"]
                _json.append((author, title, period, url))
        return _json

    def get_books(self):
        """
        Downloads the book if it's not in the texts directory.
        """
        files = [f for f in listdir(self.text_dir)]
        for author, title, period, url in self.json():
            filename = format_filename(author, title)
            if not filename in files:
                logger.debug("Getting %s" % filename)
                book = self.extractor.download_book(url, False, author, title, period)
            else:
                logger.debug("%s already downloaded" % filename)

    def train(self):
        logger.debug(" STARTING get_books")
        self.get_books()
        logger.debug(" STARTING populate")
        self.populate()
        logger.debug(" STARTING categories")
        self.categories()
        logger.debug(" STARTING conditional_probability")
        self.conditional_probability()
        self.manager.session.close_all()

    def populate(self):
        output = []
        for author, title, period, url in self.json():
            # TODO clean the next line
            words = self.extractor.read_text(format_filename(author, title))
            if len(words) == 0:
                continue
            total_words = reduce(operator.add, words.values())

            # insert period
            dic_period = {'name': period}
            list_search = ['name']
            period_obj = self.manager.get_or_insert(dict_val=dic_period,
                                                    instance=models.Period,
                                                    list_search=list_search)

            # insert book
            # logger.debug(words)
            logger.debug("Total Words: %s", total_words)
            dic_book = {'name': title,
                        'author': author,
                        'period': period_obj,
                        'total_words': total_words,
                        'sentence_total': 0}
            list_search = ['name', 'author', 'period']
            book_obj = self.manager.get_or_insert(dict_val=dic_book,
                                                  instance=models.Book,
                                                  list_search=list_search)

            # words
            filename = format_filename(author, title)
            if len(words) == 0:
                continue
            logger.debug("Period id : %s %s" % (period_obj.id, period_obj.name))
            logger.debug("Book id : %s %s %s" % (book_obj.id, book_obj.name, book_obj.author))
            self.manager.insert_words(words, book_obj, total_words)

    def categories(self):
        words_all = self.manager.get({}, Word, [], True)
        total = len(words_all)
        logger.debug(" categories Words %s" % total)
        for word_obj in words_all:
            self.calculate_categories(word_obj=word_obj)
            total -= 1
            if total % 500 == 0:
                logger.debug("Progressing Word -- Category... %s" % total)
        self.manager.session.commit()

    def calculate_categories(self, word_obj=None):
        if not word_obj:
            return False
        max_rate, min_rate = self.manager.get_max_min_rate(word_obj)
        self.manager.construct_categories(min_rate, max_rate, word_obj)

    def period_probability(self, period, log=False):
        """
        number of books from that period
        ---
        total number of books
        """
        books_period = self.manager.session.query(Book).filter_by(period=period).count()
        if log:
            logger.debug(" books_period = %f " % (books_period))
        return books_period

    def word_category_period_probability(self, word, category, period, log=False):
        """
        counts how many books from that period have this word in this category
        ---
        number of books from that period
        """
        num_books__word_cat = 0
        books_period = self.manager.session.query(Book).filter_by(period=period).all()
        for book in books_period:
            # does the book contain the word?
            book_word = self.manager.session.query(WordCount).filter_by(
                book=book, word=word).all()
            word_category = self.manager.session.query(WordCategory).filter_by(
                category=category, word=word).one()
            # if len(book_word) == 0 there is no relation, hence probability 0
            if len(book_word) > 0 and word_category:
                if book_word[0].rate >= word_category.min_range and \
                        book_word[0].rate < word_category.max_range:
                    num_books__word_cat += 1
        if log:
            logger.debug(" num_books__word_cat= %f" % (num_books__word_cat))
        return num_books__word_cat

    def probability(self, word, category, period, log=False):
        """
        probability of this word being in this category in this period
        ---
        probability of that period = number of books from that period / total number of books
        """
        word_category_period_probability = self.word_category_period_probability(
            word, category, period, log=log)
        period_probability = self.period_probability(period, log=log)
        if log:
            logger.debug(" word cat period prob = %f / period prob = %f = %f" % (
                word_category_period_probability,
                period_probability,
                word_category_period_probability / period_probability))
        return word_category_period_probability / period_probability

    def conditional_probability(self):
        """
        """
        self.manager.session.query(WordConditionalProbability).delete()
        bulk = []
        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        categories = self.manager.session.query(Category).all()
        for period in periods:
            logger.debug(period.name)
            for category in categories:
                logger.debug(category.description)
                total = len(words_all)
                for word in words_all:
                    # word rate?
                    prob = self.probability(word=word, category=category, period=period)
                    if prob > 1:
                        logger.debug("word %s category %s period %s prob %s" % (
                            word.text, category.description, period.name, prob))
                        self.probability(word=word, category=category, period=period, log=True)
                    word_cond_prob = WordConditionalProbability(
                        word=word,
                        category=category,
                        period=period,
                        probability=prob)
                    bulk.append(word_cond_prob)
                    total -= 1
                    if total % 500 == 0:
                        logger.debug("left ... %s words" % total)
        self.manager.session.add_all(bulk)
        self.manager.session.commit()
        self.complete_probability()

    def complete_probability(self):
        bulk = []
        list_cat = ['med', 'high', 'high_high']
        cats_ids = self.manager.session.query(Category).filter(
            Category.description.in_(list_cat)).all()
        low = self.manager.session.query(Category).filter(
            Category.description == 'low').one()
        words_all = self.manager.session.query(Word).all()
        periods = self.manager.session.query(Period).all()
        for period in periods:
            total = len(words_all)
            for word in words_all:
                sum_3cat = self.manager.session.query(
                    func.sum(WordConditionalProbability.probability)).filter(
                        and_(WordConditionalProbability.id_category.in_(c.id for c in cats_ids),
                             WordConditionalProbability.id_word == word.id,
                             WordConditionalProbability.id_period == period.id)
                    ).all()[0][0]
                cat_low = self.manager.session.query(WordConditionalProbability).filter(
                    and_(WordConditionalProbability.id_category == low.id,
                         WordConditionalProbability.id_word == word.id,
                         WordConditionalProbability.id_period == period.id)
                ).all()
                cat_low[0].probability = 1 - sum_3cat
                # print "word_id %s period %d sum %s" % (word.id, period.id, sum_3cat)
                total -= 1
                if total % 500 == 0:
                    logger.debug("left ... %s words" % total)
        self.manager.session.commit()
def main():
    global urlbase, acceptedNamespaces
    global expand_templates, templateCache

    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument("--json", action="store_true",
                        help="write output in json format instead of the default <doc> format")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    groupP.add_argument("--no-templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument("--html-safe", default=True,
                        help="use to produce HTML safe output within <doc>...</doc>")
    default_process_count = cpu_count() - 1
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument("-a", "--article", action="store_true",
                        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + __version__,
                        help="print program version")

    args = parser.parse_args()

    Extractor.keepLinks = args.links
    Extractor.HtmlFormatting = args.html
    if args.html:
        Extractor.keepLinks = True
    Extractor.to_json = args.json

    expand_templates = args.no_templates

    try:
        power = 'kmg'.find(args.bytes[-1].lower()) + 1
        file_size = int(args.bytes[:-1]) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        acceptedNamespaces = set(args.namespaces.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    logger = logging.getLogger()
    if not args.quiet:
        logger.setLevel(logging.INFO)
    if args.debug:
        logger.setLevel(logging.DEBUG)

    input_file = args.input

    if not Extractor.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates:
            if os.path.exists(args.templates):
                with open(args.templates) as file:
                    load_templates(file)

        with open(input_file) as file:
            page = file.read()
            ids = re.findall(r'<id>(\d*?)</id>', page)
            id = ids[0] if ids else ''
            revid = ids[1] if len(ids) > 1 else ''
            m = re.search(r'<title>(.*?)</title>', page)
            if m:
                title = m.group(1)
            else:
                logging.error('Missing title element')
                return
            m = re.search(r'<base>(.*?)</base>', page)
            if m:
                base = m.group(1)
                urlbase = base[:base.rfind("/")]
            else:
                urlbase = ''
            Extractor(id, revid, urlbase, title, [page]).extract(sys.stdout)
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except:
            logging.error('Could not create: %s', output_path)
            return

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes, args.html_safe)
def main(argv):
    caff_root = 'caffe'
    mypycaffe_dir = os.path.join(caff_root, 'python')

    parser = argparse.ArgumentParser()
    # Required arguments: input file path;
    parser.add_argument(
        "input_folder",
        help="HICO image folder containing 'train2015' and 'test2015'."
    )
    parser.add_argument(
        "output_folder",
        help="Folder to save output features."
    )
    parser.add_argument(
        "num_batch",
        type=int,
        help="Number of batches."
    )
    parser.add_argument(
        "batch_id",
        type=int,
        help="Batch index."
    )
    # Optional arguments.
    parser.add_argument(
        "--chunk_size",
        default=10,
        type=int,
        help="Number of images to work on at one time."
    )
    parser.add_argument(
        "--model_def",
        default=os.path.join(mypycaffe_dir,
                             "../models/bvlc_reference_caffenet/deploy.prototxt"),
        help="Model definition file."
    )
    parser.add_argument(
        "--pretrained_model",
        default=os.path.join(mypycaffe_dir,
                             "../models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel"),
        help="Trained model weights file."
    )
    parser.add_argument(
        "--gpu",
        action='store_true',
        help="Switch for gpu computation."
    )
    # parser.add_argument(
    #     "--center_only",
    #     action='store_true',
    #     help="Switch for prediction from center crop alone instead of " +
    #          "averaging predictions across crops (default)."
    # )
    parser.add_argument(
        "--images_dim",
        default='256,256',
        help="Canonical 'height,width' dimensions of input images."
    )
    parser.add_argument(
        "--mean_file",
        default=os.path.join(mypycaffe_dir, 'caffe/imagenet/ilsvrc_2012_mean.npy'),
        help="Data set image mean of H x W x K dimensions (numpy array). " +
             "Set to '' for no mean subtraction."
    )
    parser.add_argument(
        "--input_scale",
        type=float,
        help="Multiply input features by this scale to finish preprocessing."
    )
    parser.add_argument(
        "--raw_scale",
        type=float,
        default=255.0,
        help="Multiply raw input by this scale before preprocessing."
    )
    parser.add_argument(
        "--channel_swap",
        default='2,1,0',
        help="Order to permute input channels. The default converts " +
             "RGB -> BGR since BGR is the Caffe default by way of OpenCV."
    )
    parser.add_argument(
        "--ext",
        default='jpg',
        help="Image file extension to take as input when a directory " +
             "is given as the input file."
    )
    # new arguments
    parser.add_argument(
        "--crop_mode",
        default='oversample',
        help="Set the mode for cropping input images."
    )
    args = parser.parse_args()

    image_dims = [int(s) for s in args.images_dim.split(',')]

    mean, channel_swap = None, None
    if args.mean_file:
        # mean = np.load(args.mean_file).mean(1).mean(1)
        if len(args.mean_file) > 8 and args.mean_file[:8] == 'setmean-':
            if args.mean_file[8:] == 'VGG16':
                mean = np.array([102.9801, 115.9465, 122.7717])
            # Add more cases here.
        else:
            mean = np.load(args.mean_file).mean(1).mean(1)
    if args.channel_swap:
        channel_swap = [int(s) for s in args.channel_swap.split(',')]

    net = Extractor(args.model_def, args.pretrained_model,
                    image_dims=image_dims, mean=mean,
                    input_scale=args.input_scale, raw_scale=args.raw_scale,
                    channel_swap=channel_swap, feature_name="fc7")

    if args.gpu:
        caffe.set_mode_gpu()
        print("GPU mode")
    else:
        caffe.set_mode_cpu()
        print("CPU mode")

    # get list_src and list_des
    list_src, list_des = get_process_file( \
        args.input_folder, args.output_folder, args.num_batch, args.batch_id)

    # get chunk size
    chunk_size = args.chunk_size

    # start extract
    cnt = 0
    current = 0
    chunk_src = []
    chunk_des = []
    for src, des in zip(list_src, list_des):
        # skip if output file exists
        try:
            garbage = io.loadmat(des)
            cnt += 1
            print '{:05d}/{:05d} {}'.format(cnt, len(list_src), \
                os.path.basename(src))
            continue
        except:
            # print dest
            pass
        # start batch
        if current == 0:
            print 'start chunk'
        # update cnt and current
        cnt += 1
        current += 1
        chunk_src.append(src)
        chunk_des.append(des)
        print '{:05d}/{:05d} {}'.format(cnt, len(list_src), \
            os.path.basename(src))
        # process batch
        if current == chunk_size or cnt == len(list_src):
            # load image
            try:
                inputs = [caffe.io.load_image(img_f) for img_f in chunk_src]
            except IOError as e:
                print "I/O error: " + str(e)
                current = 0
                chunk_src = []
                chunk_des = []
                continue
            except ValueError as e:
                print "value error: " + str(e)
                current = 0
                chunk_src = []
                chunk_des = []
                continue
            # extract feature
            # features = net.extract(inputs)
            features = net.extract(inputs, args.crop_mode)
            # save feature
            for index, feature in enumerate(features):
                io.savemat(chunk_des[index], {'feat': feature})
            # reset
            current = 0
            chunk_src = []
            chunk_des = []
            print "chunk done: processed {} images.".format(cnt)

    print 'done.'