def listImgs(AP, GTS):
    '''List the images that *have* been used by the AHP. In other words,
    the images that are *not* images of opportunity.'''
    from extract import mapping, extract
    # Select the targets that produce image numbers:
    # everything labelled in mapping as IMG.
    relKeys = [key for key in mapping if key[-3:] == 'IMG']
    imgs = set()
    for key in relKeys:
        dscnNum = extract(key, AP, GTS)
        if dscnNum:  # if the image is there
            imgs.add(str(int(dscnNum)).zfill(4))
    # Add the other images in the pan:
    panImg = extract("First Pan IMG", AP, GTS)
    if panImg:
        numInPan = extract('PanNum', AP, GTS)
        panImg = int(panImg)  # convert to integer from float
        numInPan = int(numInPan)
        for number in range(1, numInPan):
            # The first image is already there, so don't add it again.
            imgs.add(str((panImg + number) % 10000).zfill(4))
    # Add the second image of the stereo pair:
    sterImg = extract('Stereo IMG', AP, GTS)
    if sterImg:
        imgs.add(str((int(sterImg) + 1) % 10000).zfill(4))
    return imgs
def checkupdates():
    logging.info("Reading from file")
    f = open('docid.txt')
    lists = map(int, f)
    docid = lists[0]
    f.close()
    logging.info("Reading complete.")
    logging.info("Starting to check for update")
    url = 'http://hib.iiit-bh.ac.in/Hibiscus/Pub/nbDocDet.php?docid={}&client=iiit&iframe=true&nb=Y'.format(docid)
    logging.info("Trying to fetch the url")
    resp = urllib2.urlopen(url)
    respData = resp.read()
    logging.info("Fetching complete.")
    regex = '<h1 style="BACKGROUND-COLOR: white; line-height: 2em; margin:0 .5em .2em .5em; padding: 4px 8px 4px 8px; border-radius: 10px;-moz-border-radius: 10px; -webkit-border-radius: 10px; border: 1px solid silver;text-decoration:none; font-size: 2.1em;">(.*?)</h1>'
    pattern = re.compile(regex)
    header = re.findall(pattern, respData)
    logging.info("Got the header")
    if not header:
        logging.info("No new notice found")
    else:
        logging.info("Got a new notice")
        logging.info("Writing to file")
        docid = docid + 1
        f = open('docid.txt', 'w')
        f.write(str(docid))
        f.close()
        logging.info("Writing complete.")
        try:
            logging.info("sending html to extract")
            extract.extract(respData, header[0], url)
        except Exception as e:
            logging.error("Calling extract failed %s", e)
def main(args):
    model = utils.get_models(
        bert_config=args.bert_config,
        pred_n_labels=args.pred_n_labels,
        arg_n_labels=args.arg_n_labels,
        n_arg_heads=args.n_arg_heads,
        n_arg_layers=args.n_arg_layers,
        pos_emb_dim=args.pos_emb_dim,
        use_lstm=args.use_lstm,
        device=args.device)
    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = 'cpu'
    model.load_state_dict(
        torch.load(args.model_path, map_location=map_location))
    model.zero_grad()
    model.eval()

    loader = load_data(
        data_path=args.test_data_path,
        batch_size=args.batch_size,
        tokenizer_config=args.bert_config,
        train=False)
    start = time.time()
    extract(args, model, loader, args.save_path)
    print("TIME: ", time.time() - start)

    test_results = do_eval(args.save_path, args.test_gold_path)
    utils.print_results("TEST RESULT", test_results,
                        ["F1 ", "PREC", "REC ", "AUC "])
def parse_sharejs(url, html):
    kind = url.rsplit('/', 2)[1]  # kind is the top-level category, as opposed to tag_list
    html = html.decode('utf-8')  # decode here
    title = extract('<h1>', '</h1>',
                    extract('<div class="post_title">', '</div>', html))
    post_content = extract('<div class="post_content" id="paragraph">',
                           '<div class="hot_tags">', html)
    if not post_content:
        post_content = extract('<div class="post_content" id="paragraph">',
                               '<div class="share">', html)
    post_content = re.sub(r'<span class="title">(.*?)</span>', '', post_content)
    content = html2markdown(post_content)
    try:
        tag_list = extract_all('">', '</a>',
                               extract('<div class="hot_tags">', '</div>', html))
    except AttributeError:
        tag_list = []
    data = {
        'kind': kind,
        'title': title,
        'source_url': url,
        'source': 'www.sharejs.com',
        'content': content,
        'tag_list': tag_list,
        'read_count': 0,
    }
    return data
def worker(self, video):
    video_path = video.split("\\")
    video_name = video_path[len(video_path) - 1][:-4]
    image_path = "G:" + os.sep + video_name + os.sep + video_name
    csv_path = "G:\\" + video_name + ".csv"
    print 'thread-%d work on video %s' % (self.number, video_name)
    # extract video
    # input: video
    # output: image_path
    self.logger.info(video_name + 'extract')
    extract.extract(video, image_path)
    # remove duplications
    self.logger.info(video_name + 'remove')
    try:
        duplicate_list = duplication.getdelSeq(image_path[0:18])
        dirs = os.listdir(image_path[0:18])
        paths = [image_path[0:18] + os.sep + dir for dir in dirs]
        filelist = list(set(paths).difference(set(duplicate_list)))
    except (TypeError):
        dirs = os.listdir(image_path[0:18])
        paths = [image_path[0:18] + os.sep + dir for dir in dirs]
        filelist = paths
    # create threads to count the MOS of each image
    self.logger.info(video_name + 'quality')
    image_queue = Queue.Queue()
    map(image_queue.put, filelist)
    self.logger.info('after removing:' + str(image_queue.qsize()))
    for i in range(15):
        t = quality.ThreadCounter(image_queue, csv_path)
        t.setDaemon(True)
        t.start()
    image_queue.join()
def process_file(filename):
    try:
        extract(os.path.join(app.config['UPLOAD_FOLDER'], filename + ".zip"),
                os.path.join("data", "delivery", "000"))
        files = utils.get_tif_list()
        fname = files[0]
        scan = PhScan(fname)

        logger.info("Generating phragmites estimate...")
        bgrn = scan.norm
        phrag = phrag_map(bgrn)

        logger.info("Generating the clusters...")
        clust = cluster_ph(scan, n_clusters=5, n_jobs=10, frac=0.05)

        ffile = os.path.join("tmp", fname.split(os.sep)[-1].replace(".TIF", "_proc.TIF"))
        if not os.path.isfile(ffile):
            logger.info("Writing processed maps to GeoTIFF {0}...".format(ffile))
            write_tif(ffile, scan, phrag, clust)

        # add time to prepare files
        time.sleep(5)
        logger.info("Processing Done")

        # -- decrease reference counters for arrays
        # del scan, bgrn, phrag, clust
        return render_template("process_done.html", filename=ffile.split(os.sep)[-1])
    except Exception as ex:
        return redirect(url_for('upload_file',
                                error="There is an error in the process_file, please try again"))
def test_extract_function(self, mock_init, mock_cleanup, mock_write_batch,
                          mock_construct_fn, mock_open):
    # set mock_init's return_value to None, since this method is mocking
    # a constructor and constructor is required to return None
    mock_init.return_value = None

    source_type = "postgres"
    credentials = {'dbname': 'somedb', 'user': '******'}
    source_config = {'table': 'sometable', 'key2': 'somevalue'}
    extract_location = "/some/path"
    extract_filename = "a_file"

    extract.extract(source_type, credentials, source_config,
                    extract_location, extract_filename)

    # verify call to open()
    expected_filename_with_path = self.filename_constructed
    mock_open.assert_called_once_with(expected_filename_with_path, "w+")

    # verify call to construct_function()
    mock_construct_fn.assert_called_once_with(extract_location, extract_filename)

    # verify calls to write_batch()
    self.assertEqual(2, mock_write_batch.call_count)
    write_batch_calls = [(1, 2), (3, 4)]
    write_batch_call_list = mock_write_batch.call_args_list

    first_call = write_batch_call_list[0]
    first_call_args, first_call_kwargs = first_call
    first_call_args_of_interest = first_call_args[1]

    second_call = write_batch_call_list[1]
    second_call_args, second_call_kwargs = second_call
    second_call_args_of_interest = second_call_args[1]

    self.assertEqual(first_call_args_of_interest, [(1, "aaa", 1000), (2, "bbb", 2000)])
    self.assertEqual(second_call_args_of_interest, [(3, "ccc", 3000), (4, "ddd", 4000)])

    # verify call to cleanup()
    mock_cleanup.assert_called_once_with()

    # verify class constructor called with expected arguments
    mock_init.assert_called_once_with(credentials, source_config)
def extract_spectra(hdu, yc, dy, outfile, ext=1, minsize=5, thresh=3, grow=0,
                    smooth=False, maskzeros=False, convert=True,
                    cleanspectra=True, calfile=None, clobber=True,
                    specformat='ascii'):
    """From an image, extract a spectrum."""
    data = hdu[ext].data
    # replace the zeros with the average from the frame
    if maskzeros:
        mean, std = iterstat(data[data > 0])
        # rdata = mean np.random.normal(mean, std, size=data.shape)
        data[data <= 0] = mean  # rdata[data <= 0]

    y1 = yc - dy
    y2 = yc + dy
    ap_list = extract(hdu, method='normal', section=[(y1, y2)],
                      minsize=minsize, thresh=thresh, convert=convert)
    sy1a = y2
    sy2a = sy1a + 2.0 * dy
    ska_list = extract(hdu, method='normal', section=[(sy1a, sy2a)],
                       minsize=minsize, thresh=thresh, convert=convert)
    sy2b = y1 - dy
    sy1b = sy2b - 2.0 * dy
    skb_list = extract(hdu, method='normal', section=[(sy1b, sy2b)],
                       minsize=minsize, thresh=thresh, convert=convert)
    print sy1b, sy2b

    sdata = 0.5 * (ska_list[0].ldata / (sy2a - sy1a) + skb_list[0].ldata / (sy2b - sy1b))
    # sdata = ska_list[0].ldata / (sy2a - sy1a)
    # sdata = skb_list[0].ldata / (sy2b - sy1b)

    raw = 1.0 * ap_list[0].ldata
    print 'extract:', ap_list[0].ldata[1124]
    ap_list[0].ldata = ap_list[0].ldata - float(y2 - y1) * sdata
    print 'sky:', ap_list[0].ldata[1124]
    print ap_list[0].wave[10], ap_list[0].ldata[10], ap_list[0].lvar[10]
    flux_spec = Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata,
                                  abs(ap_list[0].lvar) ** 0.5, stype='continuum')
    print flux_spec.wavelength[10], flux_spec.flux[10], flux_spec.var[10]

    if cleanspectra:
        clean_spectra(ap_list[0], grow=grow)
    print 'clean:', ap_list[0].ldata[1124]

    if calfile:
        cal_spectra = st.readspectrum(calfile, error=False, ftype='ascii')
        airmass = hdu[0].header['AIRMASS']
        exptime = hdu[0].header['EXPTIME']
        extfile = os.path.dirname(st.__file__) + "/suth_extinct.dat"
        print extfile
        ext_spectra = st.readspectrum(extfile, error=False, ftype='ascii')

        flux_spec = Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata,
                                      abs(ap_list[0].lvar) ** 0.5, stype='continuum')
        print flux_spec.flux[10], flux_spec.var[10]
        flux_spec = calfunc(flux_spec, cal_spectra, ext_spectra, airmass, exptime, True)
        print flux_spec.flux[10], flux_spec.var[10]
    else:
        flux_spec = Spectrum.Spectrum(ap_list[0].wave, ap_list[0].ldata,
                                      abs(ap_list[0].lvar) ** 0.5, stype='continuum')

    if specformat == 'ascii':
        write_ascii(outfile, flux_spec, clobber=clobber)
    elif specformat == 'lcogt':
        write_lcogt(outfile, flux_spec, hdu, sky=float(y2 - y1) * sdata,
                    raw=raw, clobber=clobber)
def test_extract_with_by_key(self):
    self.assertEquals(
        extract.extract(
            'root/section/item2',
            '{"root": {"section": {"item1": "value1", "item2": "value2"}}}'
        ),
        'value2')
    self.assertEquals(extract.extract('a/b/c', '{"a":{"b":{"c":"d"}}}'), 'd')
def top(path, f_type, hang, lie):
    f_list = readfile.readfile(path, f_type)
    for i in f_list:
        extract.extract(i, hang, lie)
    return
def testCheckUrl(self):
    '''
    Validates the url check incorporated in the extract function
    '''
    urls_for_validation = ['google.com', 'https://flipkart.com/ayush', 'https://amazon.com']
    with self.assertRaises(NameError) as context:
        for url in urls_for_validation:
            extract(url)
    self.assertEqual(context.exception.message, 'Invalid URL given')
def main():
    "run main function on parsed args"
    # get arguments from command line as a dict-like object
    args = parse_command_line()
    pdf_path = input("Please input full pdf path and add .pdf \n")
    # pass argument to call darwinday function
    if args.run:
        extract(pdf_path)
def handler(event, context):
    """
    entry point for Lambda function
    :param event: the Lambda event
    :param context: the Lambda context
    :return: None
    """
    print(f"'event': {event}")
    print(f"'context': {context}")

    # -----------------------------------------------------
    # EXTRACT

    # define ny_dataset
    ny_dataset = classes.Dataset("ny_dataset")
    ny_dataset.headers_all = ["date", "cases", "deaths"]
    ny_dataset.headers_key = ny_dataset.headers_all
    ny_dataset.match_field = "date"
    ny_dataset.source_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv"

    # extract and print ny_dataset
    ny_dataset.df = extract.extract(ny_dataset.source_url)
    print(f"'ny_dataset.df':\n{ny_dataset.df}")

    # define jh_dataset
    jh_dataset = classes.Dataset("jh_dataset")
    jh_dataset.headers_all = [
        "Date", "Country/Region", "Province/State", "Lat", "Long",
        "Confirmed", "Recovered", "Deaths"
    ]
    jh_dataset.headers_key = ["Date", "Country/Region", "Recovered"]
    jh_dataset.match_field = "Date"
    jh_dataset.source_url = \
        "https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv"

    # extract and print jh_dataset
    jh_dataset.df = extract.extract(jh_dataset.source_url, jh_dataset.headers_key,
                                    "Country/Region", "US")
    print(f"'jh_dataset.df':\n{jh_dataset.df}")

    # -----------------------------------------------------
    # TRANSFORM

    # transform the datasets into CovidStat Instances
    covid_stats = transform.transform(ny_dataset, jh_dataset)

    # print CovidStats
    print(*covid_stats, sep="\n")

    # -----------------------------------------------------
    # LOAD

    # load CovidStat instances into the CovidStats DynamoDB table
    load.load_all(classes.CovidStat, covid_stats)
    load.load_json(covid_stats)
def iqiyi_spider(self, url):
    '''iQiyi crawler'''
    r = requests.get(url)
    if r.status_code == 200:
        v_id = extract('data-player-tvid="', '"', r.text)
        url_ = 'http://mixer.video.iqiyi.com/jp/mixin/videos/{v_id}'.format(v_id=v_id)
        r = requests.get(url_)
        if r.status_code == 200:
            return extract('"playCount":', ',"', r.text)
def extract_interface():
    # ex. files/stego/
    stego_image_dir = gvar.directory['stego']
    mlib.check_dir(stego_image_dir)
    stego_image_folders = os.listdir(stego_image_dir)

    # ex. /home/.../pySTC/files/message_embed/R
    message_dir = gvar.directory['message_extract']
    message_dir_channel = {}
    for i in stego_image_folders:
        message_dir_channel[i] = os.path.join(message_dir, i)
        mlib.check_dir(message_dir_channel[i])

    print('In ' + str(stego_image_dir))
    print('Channel list: ' + str(stego_image_folders))
    print('Extract start...\n')

    for i in range(len(stego_image_folders)):
        # ex. files/stego/R
        stego_image_folders[i] = os.path.join(stego_image_dir, stego_image_folders[i])
        stego_image_filelist = os.listdir(stego_image_folders[i])
        stego_image_filelist.sort()
        data_size = len(stego_image_filelist)
        print(str(data_size) + " images to extract in " + str(stego_image_folders[i]))
        print("Start extracting...")
        for j in tqdm(range(int(data_size)), file=sys.stdout):
            # ex. files/stego/R\01-source-00002_stego_R.bmp
            stego_image = os.path.join(stego_image_folders[i], stego_image_filelist[j])
            # ex. 01-source-00002_stego_R
            stego_image_name = os.path.splitext(stego_image_filelist[j])[0]
            channel = stego_image_name.split('_')[-1]
            output_message_file = stego_image_name + '.txt'
            message_file = os.path.join(message_dir_channel[channel], output_message_file)
            # print(message_file)
            extract(stego_image, message_file, channel)
    print('Done.\n')
def scan(filepath, dirpath, log):
    extract(filepath, dirpath)
    extracted_dirpath = dirpath + '/_' + basename(filepath) + '.extracted'
    if not isdir(extracted_dirpath):
        return [(filepath, False)]
    files = listdir(extracted_dirpath)
    binary_files = [
        join(extracted_dirpath, f) for f in files
        if isfile(join(extracted_dirpath, f))
    ]
    log('Found {} embedded files in {}'.format(len(binary_files), filepath))
    return [scan_file(f, log) for f in binary_files]
def extractPackage(package, tarballsDir, sourcesDir, patchesDir):
    if not isdir(sourcesDir):
        makedirs(sourcesDir)
    sourceDirName = package.getSourceDirName()
    packageSrcDir = joinpath(sourcesDir, sourceDirName)
    if isdir(packageSrcDir):
        rmtree(packageSrcDir)
    extract(joinpath(tarballsDir, package.getTarballName()), sourcesDir,
            TopLevelDirRenamer(sourceDirName))
    diffPath = joinpath(patchesDir, sourceDirName + ".diff")
    if isfile(diffPath):
        for diff in Diff.load(diffPath):
            patch(diff, sourcesDir)
            print "Patched:", diff.getPath()
def update_datadir(self, datadir):
    logger.info('Entering {}'.format(datadir))
    h_idx_file = items.IndexFile(os.path.join(self.data_root, datadir, self.h_index_fname))
    h_idx_file_rev = h_idx_file.header.revision
    logger.info('Index revision is {}'.format(h_idx_file_rev))
    db_dir_entry = self.db['datadirs'][datadir]

    # Skip if revision has not changed
    if db_dir_entry['revision'] == h_idx_file_rev:
        logger.info('Revision unchanged, nothing to update')
        return

    cur_sec_idx = db_dir_entry['cur_section']
    for sec in u.full_circle(h_idx_file.sections, cur_sec_idx):
        logger.debug('Entering section {}'.format(sec.idx))
        if sec.idx == cur_sec_idx:
            next_vrec_idx = db_dir_entry['last_vrec'] + 1
        else:
            next_vrec_idx = 0
        next_vrecs = u.islice_from(sec.video_records, next_vrec_idx)
        for i, vrec in enumerate(next_vrecs):
            if vrec.start_dt == datetime.utcfromtimestamp(0):
                logger.debug(
                    'Skipping extraction of incomplete vrec at {}:{:x}'
                    .format(vrec._h_idx_file.name, vrec._pos)
                )
                continue
            try:
                extract(vrec)
                db_dir_entry['last_vrec'] = next_vrec_idx + i
                db_dir_entry['cur_section'] = sec.idx
                self.db['cur_datadir'] = datadir
                self.db.save()
            except FileExistsError as e:
                logger.info(
                    'File {} exists, will not overwrite'
                    .format(e.filename)
                )
    logger.info('Done processing revision {}'.format(h_idx_file_rev))
    db_dir_entry['revision'] = h_idx_file_rev
    self.db.save()
def match(filePath, fileName):
    fn = fileName.split('.')[0]
    '''
    # Generate the parse and dependency files
    corpusProcess.segment(filePath+fileName, "data/"+fn+"_分词.txt")
    os.system("java -jar nlp.jar " + "data/ " + fn+"_分词.txt")
    os.remove("data/"+fn+"_分词.txt")
    corpusProcess.parse("data/"+fn+"_句法分析.txt", "data/"+fn+"_parse.txt")
    corpusProcess.depend("data/"+fn+"_依存关系.txt", "data/"+fn+"_depend.txt")
    '''
    # Read the sentences and the parse and dependency data
    with open(filePath + fileName, 'r', encoding="utf8") as f:
        sentences = f.readlines()
    with open("data/" + fn + "_parse.txt", 'r', encoding="utf8") as pf:
        parseJson = pf.readlines()
    with open("data/" + fn + "_depend.txt", 'r', encoding="utf8") as df:
        dependJson = df.readlines()
    parseCommon, dependCommon = loadCommon("data/" + fn + "_parse.txt",
                                           "data/" + fn + "_depend.txt")

    # Check whether each sentence matches the patterns
    vecPOS = []
    vecEmo = []
    vecPAD = []
    for i in range(len(sentences)):
        # Does it match the keyword + POS-tagging pattern?
        if matchPOS(sentences[i]):
            vecPOS.append(1)
        else:
            vecPOS.append(0)
        # Does it match the sentiment-annotation pattern?
        if matchEmo(sentences[i]):
            vecEmo.append(1)
        else:
            vecEmo.append(0)
        # Does it match the syntax + dependency pattern?
        count = 0
        parse = json.loads(parseJson[i])
        for key in parse.keys():
            if key in parseCommon:
                count += 1
        depend = json.loads(dependJson[i])
        for key in depend.keys():
            if key in dependCommon:
                count += 1
        if count >= 35:
            vecPAD.append(1)
        else:
            vecPAD.append(0)
    # Opinion-sentence extraction
    extract.extract(vecPOS, filePath, fileName)
    return vecPOS, vecEmo, vecPAD
def main():
    args = parse_args()
    try:
        if args.action == "extract":
            if args.verbose:
                print "Extracting archive"
            extract.extract(infile=args.input, outfile=args.output, verbose=args.verbose)
        elif args.action == "archive":
            if args.verbose:
                print "Creating archive"
            archive.archive(infile=args.input, compression=args.compression,
                            outfile=args.output, verbose=args.verbose)
    except (extract.ExtractException, archive.ArchiveException) as ex:
        print >> sys.stderr, ex.msg
        return ex.code
    return 0
def sim2():
    resolution = 0.006
    acc = extract.extract('./B_Accelerometer_data/jog_9/sub_3.csv', "acc")
    localangular = extract.extract('./C_Gyroscope_data/jog_9/sub_3.csv', "gyro")
    acc = acc[0:min(len(acc), len(localangular))]
    localangular = localangular[0:min(len(acc), len(localangular))]
    print(len(acc))
    print(len(localangular))
    assert (len(acc) == len(localangular))
    steps = len(acc)
    new_sim = Dead_Reckoning.Dead_Reckoning(acc, localangular, resolution, steps)
    new_sim.simulate()
    new_sim.plot_traj()
def xinlang_spider(self, url):
    '''Sina forum crawler'''
    r = requests.get(url)
    if r.status_code == 200:
        count = extract('<font color="#ff0000"> ', '</font>', r.text)
        return int(count.replace(',', ''))
def last():
    home = get_response(host="opslinux.com", url="/")
    content = extract_all('<article>', '</article>', home)
    for item in content:
        title_html = extract('<a href="', '</a>', item)
        title = title_html.split('">')
        print "Title: %s \nURL: %s\n" % (title[1], title[0])
def test_line_extract_4(self):
    line = """2015-03-04 03:13:51 125.122.116.68 POST /revue/JCHA/1995/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 125.122.116.68 "" "-" 200 6387"""
    record = extract(line, JournalReferential([]))
    self.assertIsNotNone(record)
    self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 3, 13, 51)))
    self.assertEqual(record.proxy_ip, "125.122.116.68")
    self.assertEqual(record.http_method, "POST")
    self.assertEqual(record.user_ip, "125.122.116.68")
    self.assertEqual(record.country, "CN")
    self.assertEqual(record.continent, "AS")
    self.assertEqual(record.timezone, "Asia/Shanghai")
    self.assertEqual(record.geo_coordinates, "30.2936, 120.1614")
    self.assertEqual(record.url, "/revue/JCHA/1995/v6/n1/031091ar.pdf")
    self.assertEqual(record.raw_user_agent, "")
    self.assertEqual(record.browser, "")
    self.assertEqual(record.os, "")
    self.assertEqual(record.device_type, "")
    self.assertFalse(record.is_good_robot)
    self.assertEqual(record.referer, "")
    self.assertEqual(record.http_response_code, 200)
def allcore(inputarray, indexarray, coresize, kernel_width_total):
    core = []
    for j in indexarray:
        c = ex.extract(j, inputarray, coresize, kernel_width_total)
        core.append(c)
    return core
def all():
    archives = get_response(host="opslinux.com", url="/archives.html")
    content = extract_all('<article>', '</article>', archives)
    for item in content:
        title_html = extract('<a href="', '</a>', item)
        title = title_html.split('">')
        print "Title: %s \nURL: %s\n" % (title[1], title[0])
def test_line_extract_3(self):
    line = """2015-03-04 00:29:36 222.33.68.117 GET /revue/JCHA/2015/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 222.33.68.117 "-" "-" 400 460"""
    record = extract(line, JournalReferential([]))
    self.assertIsNotNone(record)
    self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 0, 29, 36)))
    self.assertEqual(record.proxy_ip, "222.33.68.117")
    self.assertEqual(record.http_method, "GET")
    self.assertEqual(record.user_ip, "222.33.68.117")
    self.assertEqual(record.country, "CN")
    self.assertEqual(record.continent, "AS")
    self.assertEqual(record.timezone, "Asia/Shanghai")
    self.assertEqual(record.geo_coordinates, "39.9289, 116.3883")
    self.assertEqual(record.url, "/revue/JCHA/2015/v6/n1/031091ar.pdf")
    self.assertEqual(record.raw_user_agent, "-")
    self.assertEqual(record.browser, "Other")
    self.assertEqual(record.os, "Other")
    self.assertEqual(record.device_type, "")
    self.assertFalse(record.is_good_robot)
    self.assertEqual(record.referer, "")
    self.assertEqual(record.http_response_code, 400)
    self.assertEqual(record.age, 0)
def test_line_extract_2(self):
    line = """2015-03-04 02:17:29 100.43.91.4 GET /revue/JCHA/2014/v6/n1/031091ar.pdf HTTP/1.1 - 80 - 100.43.91.4 "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)" "-" 200 6387"""
    record = extract(line, JournalReferential([]))
    self.assertIsNotNone(record)
    self.assertEqual(record.timestamp, get_montreal_time(datetime(2015, 3, 4, 2, 17, 29)))
    self.assertEqual(record.proxy_ip, "100.43.91.4")
    self.assertEqual(record.http_method, "GET")
    self.assertEqual(record.url, "/revue/JCHA/2014/v6/n1/031091ar.pdf")
    self.assertEqual(record.user_ip, "100.43.91.4")
    self.assertEqual(record.country, "US")
    self.assertEqual(record.continent, "NA")
    self.assertEqual(record.timezone, "America/Los_Angeles")
    self.assertEqual(record.geo_coordinates, "37.4135, -122.1312")
    self.assertEqual(record.journal_name, "jcha")
    # self.assertEqual(record.journal_domain, "")
    self.assertEqual(record.raw_user_agent,
                     "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)")
    self.assertEqual(record.browser, "YandexBot")
    self.assertEqual(record.os, "Other")
    self.assertEqual(record.device_type, "")
    self.assertTrue(record.is_good_robot)
    self.assertEqual(record.referer, "")
    self.assertEqual(record.http_response_code, 200)
    self.assertEqual(record.age, 1)
def reduce_dither_pair(dither_a, dither_b, traces, trace_direction=1, lamp_image=None):
    '''dither_a and dither_b are two dither positions of the same source,
    already flat-fielded. traces is a list of initial guesses for trace
    parameters. trace_direction is 1 for a horizontal trace and 0 for a
    vertical trace.'''
    # p_init = composite_model(traces, model_type='gaussian')
    lamps = lamp_image != None
    pdb.set_trace()
    difference_image = im_subtract(dither_a, dither_b)[1]
    postrace, negtrace = fit_trace(difference_image, traces, tracedir=trace_direction)
    dither_a = fix_distortion(dither_a, postrace, trace_direction)
    dither_b = fix_distortion(dither_b, negtrace, trace_direction)
    difference_image = im_subtract(dither_a, dither_b)[1]
    all_profiles = fit_trace(difference_image, traces, tracedir=trace_direction)
    telluric_image = im_minimum(dither_a, dither_b)[1]
    return extract(all_profiles, difference_image, telluric_image,
                   tracedir=trace_direction, lamps=lamps, lamp=lamp_image)
def get_permanent_wechat_article_url(self, sougou_url):
    """Get the permanent article url from the temporary Sogou url.

    Args:
        sougou_url (str): "http://mp.weixin.qq.com/s?timestamp=1473815432&src=3&ver=1&signature=puOtJfG0mefG5o6Ls-bqDmML9ZjS5S6oDIhdUReNRm6*bIF9yINfCoXvB3btXzPEeUZvV8bdlSRTgKPx5Nsd6ZfzLK4Gv4X6z7te1EEo2azG3llx*rw*fxqXrKnwP2oqTTrNYxaRzM8cARFIbjPHVLpWdZGqNhyxsKoK5ozlXSk="
    Returns:
        msg_link (str): "http://mp.weixin.qq.com/s?__biz=MzI1OTAwNDc1OA==&mid=2652831837&idx=1&sn=3a93c0b6dfeef85e9b85bdac39f47bce&chksm=f1942064c6e3a9728f0bdc4d9bab481b7079c7c1d9ed32397295b45d0b02af839dafcc4b093e#rd"
    """
    time.sleep(random.randint(1, 10))
    curl_str = """
    curl 'http://mp.weixin.qq.com/s?timestamp=1473815432&src=3&ver=1&signature=puOtJfG0mefG5o6Ls-bqDmML9ZjS5S6oDIhdUReNRm6*bIF9yINfCoXvB3btXzPEeUZvV8bdlSRTgKPx5Nsd6ZfzLK4Gv4X6z7te1EEo2azG3llx*rw*fxqXrKnwP2oqTTrNYxaRzM8cARFIbjPHVLpWdZGqNhyxsKoK5ozlXSk=' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Connection: keep-alive' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' --compressed
    """
    _, headers, _ = parse_curl_str(curl_str)
    headers['User-Agent'] = random_ua()
    r = requests.get(sougou_url)
    html = r.text
    try:
        msg_link = xhtml_unescape(extract('msg_link = "', '";', html))
    except Exception:
        self.logger.exception(html)
        msg_link = sougou_url
    self.logger.info('get permanent url: %s', msg_link)
    return msg_link
def stylecloud(request: StyleCloudRequest):
    params = request.dict()
    url = params.pop("url", None)
    text = params.pop("text", None)
    background_color = params.pop("background_color", None)
    gradient = params.pop("gradient", None)

    if gradient == Gradient.none:
        gradient = None

    if url is not None:
        result = extract(url)
        pprint.pprint(result)
        text = result["text"]
    elif text is None:
        raise Exception('Must provide either "text" or "url".')

    sc.gen_stylecloud(**params,
                      text=text,
                      gradient=gradient,
                      icon_dir="/tmp/icons",
                      output_name=OUTPUT_NAME,
                      background_color=background_color.as_hex())
    return FileResponse(OUTPUT_NAME, media_type="image/png", headers=headers)
def zhidao_spider(self, url):
    '''Baidu Zhidao crawler'''
    id = extract('http://zhidao.baidu.com/question/', '.html', url)
    url_ = 'http://zhidao.baidu.com/api/qbpv?q={id}'.format(id=id)
    r = requests.get(url_)
    if r.status_code == 200:
        return r.text
def main():
    np.random.seed(12345)

    # read in the first few time series from the TIDIGITS dataset; the return
    # value is a collection of LabeledTimeSeries (see datasets.utils). You
    # will of course need to have the relevant dataset on your machine, as
    # well as update datasets/paths.py to point to it. For TIDIGITS
    # specifically, you will also need to have librosa installed. For the
    # UCR datasets, the whichExamples argument takes this many examples from
    # all 20 datasets
    whichExamples = np.arange(2)
    tsList = datasets.loadDataset(datasets.TIDIGITS, whichExamples=whichExamples)

    # uncomment any of these to use a different dataset
    # tsList = datasets.loadDataset(datasets.DISHWASHER, whichExamples=whichExamples)
    # tsList = datasets.loadDataset(datasets.MSRC, whichExamples=whichExamples)
    # tsList = datasets.loadDataset(datasets.UCR, whichExamples=[0])

    Lmin, Lmax = 1. / 20, 1. / 10  # fractions of time series length
    for ts in tsList:
        startIdxs, endIdxs, model, featureMat, featureMatBlur = extract(
            ts.data, Lmin, Lmax)
        plotExtractOutput(ts, startIdxs, endIdxs, featureMat, model)

        # you can also call this if you just want to see what the data looks like
        # ts.plot()

        # plt.savefig(ts.name + '.pdf')  # use this to save it
        plt.show()
def wrangle_reviews(path, userid=None):
    """
    For a review xml file, extracts and loads into database
    """
    userid = userid or userid_from_path(path)
    session = create_session()

    # Get user object
    user = User(id=userid)
    user = session.merge(user)

    with extract(path) as reviews:
        for review in reviews:
            book = Book(**review.get_book_data())
            book = session.merge(book)
            for author in review.get_author_data():
                author = Author(**author)
                author = session.merge(author)
            for data in review.get_book_authors_data():
                book_author = BookAuthor(**data)
                book_author = session.merge(book_author)
            review = review.get_book_reviews_data()
            review.update({'user_id': userid})
            review = Review(**review)
            review = session.merge(review)

    session.commit()
    session.close()
def extract_and_report(argv, html=False, matching_only=True):
    """Extract all credit card numbers from a list of plain text files
    and produce a report.

    @see: L{BincodesDB.fetch}

    @type  argv: list(str)
    @type  html: bool
    @type  matching_only: bool
    @param argv: List of filenames, glob wildcards, or the special value "-".
        See: L{extract.listfiles}
    @param html: C{True} for an HTML report, C{False} for a plain text report.
    @param matching_only: C{True} to show only credit cards that match known
        bincodes, C{False} to show all credit cards.

    @rtype: iterator of (str, Table)
    @return: Yields tuples with the filename and the report for that file.
    """
    found = set()
    bincodes = BincodesDB()
    try:
        for filename in listfiles(argv):
            if filename != '-':
                data = open(filename, 'r').read()
            else:
                data = sys.stdin.read()
            table = Table(html, header_row)
            for cc in extract(data):
                if cc not in found:
                    row = list(bincodes.fetch(cc))
                    if not matching_only or row[1] is not None:
                        table.add_row(row)
                    found.add(cc)
            yield (filename, table)
    finally:
        bincodes.close()
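# A minimal usage sketch for extract_and_report, not part of the original module: it
# assumes printing a Table yields its rendered form (the real Table API may differ) and
# that the caller passes file paths or glob patterns, e.g. sys.argv[1:].
def report_to_stdout(paths):
    # consume the generator one file at a time and emit each report
    for filename, table in extract_and_report(paths, html=False, matching_only=True):
        print("Report for {0}:".format(filename))
        print(table)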
def rear():
    if request.method == 'POST':
        extraction = extract.extract(request.files['file'], 'rear')
        write('rear', extraction['data'])
        return extraction
    return render_template('rear.html')
def update_file(base_dir, uuid, real_path):
    hasher = hashlib.sha1()
    try:
        with open(real_path, "rb") as afile:
            stat = os.fstat(afile.fileno())
            size = stat.st_size
            mtime = stat.st_mtime
            buf = afile.read(blocksize)
            while len(buf) > 0:
                hasher.update(buf)
                buf = afile.read(blocksize)
    except IOError:  # e.g. the file was deleted at exactly the wrong moment
        logging.exception("calculating hash")
        with oscar.context(base_dir, oscar.min_free_blocks) as context:
            delete.delete_by_uuid(context, uuid)

    row = {"_key": uuid, "size": size, "mtime": mtime, "dirty": False}
    hashval = hasher.hexdigest()

    extracted_content = None
    if fulltext_already_exists(base_dir, hashval):
        # logging.debug("Fulltext already exists %s" % hashval)
        row["fulltext"] = hashval
    else:
        try:
            # extract the fulltext only when the file size is within the limit
            if size <= fulltext_max_file_size:
                extracted_content = extract.extract(real_path)
        except Exception, e:
            # many formats are handled, so there is no telling what kind of exception may occur
            log.create_log(base_dir, "extract",
                           u"%s (%s): %s" % (real_path.decode("utf-8"), hashval,
                                             e.message.decode("utf-8")))
def front():
    if request.method == 'POST':
        extraction = extract.extract(request.files['file'], 'front')
        write('front', extraction['data'])
        return extraction
    return render_template('front.html')
def img_to_txt(filename=''):
    if filename == '':
        # default image
        img = cv2.imread('./static/ku.jpg')
    else:
        print('not using default image')
        img = cv2.imread('.' + filename)
        print('.' + filename)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # show histogram
    # plt.hist(img.ravel(), 256, [0, 256])
    # plt.show()

    retval, img = cv2.threshold(img, 100, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # cv2.imshow('img', img)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()
    cv2.imwrite('./static/processed_image.jpg', img)
    text = pytesseract.image_to_string(img)
    data = extract(text)
    return data
def generate_features(featureGenerator, directory):
    originalPath = os.path.dirname(sys.argv[0])
    os.getcwd()
    os.chdir(directory)
    print 'generating features for dir:', directory
    programPath = os.path.join(originalPath, "build", featureGenerator)
    featuresDir = os.path.join(directory, featureGenerator.replace(".exe", "") + "/")
    if not os.path.exists(featuresDir):
        os.mkdir(featuresDir)
    for i in os.listdir(os.getcwd()):
        if i in gesturesAll:
            gestureDir = os.path.join(featuresDir, i)
            if not os.path.exists(gestureDir):
                os.mkdir(gestureDir)
            print i
            os.chdir(directory + "/" + i)
            # for dataFile in os.listdir(os.getcwd()):
            for dataFile in glob.glob(os.path.join(os.getcwd(), '*.avi')):
                realpath = os.path.realpath(dataFile)
                result = extract.extract(realpath, programPath)
                basename = os.path.splitext(os.path.basename(dataFile))[0] + ".txt"
                outfile = os.path.join(gestureDir, basename)
                print outfile
                try:
                    os.remove(outfile)
                except OSError:
                    pass
                f = open(outfile, 'w')
                f.write(result)
                f.close()
    os.chdir(originalPath)
def test_extract():
    """
    :Author: Tim Hoer
    :Date: November 20, 2017
    :Notes: Tests that function loads all images from input directory and
      stores them as instances of the lesion class.
    """
    import os
    import urllib.request
    import shutil
    import tempfile
    from extract import extract
    from Image import Image
    # create temporary directory
    test_dir = tempfile.mkdtemp()
    # test extract on empty directory
    # assertRaises(Exception, extract, test_dir)
    # upload images to temporary directory
    fullfilename = os.path.join(test_dir, 'puppy.jpg')
    urllib.request.urlretrieve(
        "http://www.zarias.com/wp-content/uploads/2015/12/61-cute-puppies.jpg",
        fullfilename)
    fullfilename = os.path.join(test_dir, 'kitten.jpg')
    urllib.request.urlretrieve(
        "http://weknowyourdreams.com/images/kittens/kittens-02.jpg",
        fullfilename)
    # call function
    out = extract(test_dir)
    # check that output array is instance of lesion class
    assert (len(out) == 2)
    assert (isinstance(out[0], Image) is True)
    # remove temporary directory
    shutil.rmtree(test_dir)
def testCheckFlipkart(self):
    '''
    The extract function should return a dict on execution with urls
    having the xpath in the website_csv list
    '''
    item_name = "Brica Pop Open Cling Sun Shade"
    item_name_extracted = str(extract('http://www.flipkart.com/brica-pop-open-cling-sun-shade/p/itme2znucyhn7un2?pid=SUDE2ZNUDUFMJ36M&srno=b_1&offer=DOTDOnAutomotive_Jan21.&ref=75afa3e4-e5b7-425a-92f1-745b4f6b7f99')['name']).replace("\n", "").replace(" ", "")
    self.assertEqual(item_name.replace(" ", ""), item_name_extracted)
def test_it_saves_to_the_database():
    sheets = extract.validate(extract.extract('fixture/simple.xlsx'))
    extract.save(sheets)
    data = scraperwiki.sql.select('* from Sheet1')
    row = data[2]
    assert_equals(row['Year'], 2012)
    assert_equals(row['Awesomeness'], 8)
def test_it_saves_a_unicode_csv_to_the_database():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    extract.save(sheets)
    data = scraperwiki.sql.select('* from swdata')
    row = data[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
def get_all_tag_urls(url='http://www.sharejs.com/codes/'):
    html = requests.get(url).content.decode('utf-8')
    tag_urls = extract_all('<a href="', '"',
                           extract('<div class="tags_cloud">', '</ul>', html))
    base_url = 'http://www.sharejs.com%s'
    tag_urls = [base_url % i for i in tag_urls]
    tag_urls = [i + '?start=0' for i in tag_urls]
    return tag_urls
def testCheckFlipkart(self):
    '''
    The extract function should return a dict on execution with urls
    having the xpath in the website_csv list
    '''
    item_name = "Scullers Men's Checkered Casual Shirt"
    item_name_extracted = str(extract('http://www.flipkart.com/scullers-men-s-checkered-casual-shirt/p/itmduvc4fpgtktkf?pid=SHTDUJF6XSSNB92T&srno=b_1&ref=884be278-844c-4a29-b300-b0c131dfddb0')['name']).replace("\n", "").replace(" ", "")
    self.assertEqual(item_name.replace(" ", ""), item_name_extracted)
def test_it_can_extract_a_unicode_csv():
    sheets = extract.validate(extract.extract('fixture/mps_unicode.csv'))
    assert_equals(len(sheets), 1)
    sheet = sheets['swdata']
    assert_equals(len(sheet), 653)
    row = sheet[460]
    assert_equals(row['MP Name'], 'Michelle Gildernew')
    assert_equals(row['Party'], u'Sinn Féin')
def handle_html(self, url, html):
    html = html.decode('utf-8')
    url_list = extract_all('<a href="', '"',
                           extract('<div class="code_list">', '</ul>', html))
    article_list = [i for i in url_list if 'author' not in i]
    base_url = 'http://www.sharejs.com'
    article_list = [base_url + i for i in article_list]
    article_list.pop(0)
    self.results.extend(article_list)
def textChanger(pdfText, mostAuthor="", mostPaper="",
                extractOptions=["nltk", 5, 5, 5], devMode=False):
    """Takes the semi-cleaned text of a pdf and extracts the desired portions.
    Output in markdown suitable for display on the website."""
    pdfText = pre_clean.pre_clean(pdfText)
    if mostAuthor:
        mostAuthor = evaluator(authorCounter(pdfText))
    if mostPaper:
        mostPaper = evaluator(paperCounter(pdfText))
    ex = extract(pdfText, extractOptions)
    return ex
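# A hypothetical usage sketch for textChanger, not part of the original module: it assumes
# pre_clean, extract, and the counter/evaluator helpers imported by the surrounding module
# are available, and only illustrates the expected call shape and return value.
if __name__ == "__main__":
    sample_text = "Example pdf text after initial cleanup."
    markdown_out = textChanger(sample_text, extractOptions=["nltk", 5, 5, 5])
    print(markdown_out)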