def batch_scrape(session):
    # Scrape everything uploaded under this session id into the session's
    # output folder, then bundle the results into a zip.
    scrape.run(os.path.join(app.config['UPLOAD_FOLDER'], session['sid']),
               batch=True,
               output=os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'], session['sid']))
    zip_files(session['sid'])
    print("BATCH DONE", file=sys.stderr)
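# zip_files() is not defined in this section. As a rough, hypothetical sketch
# (an assumption about its job, not the project's actual helper), it could
# archive the session's output folder so the batch results download as one file:
#
#   import shutil
#
#   def zip_files(sid):
#       out_dir = os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'], sid)
#       # shutil.make_archive writes <out_dir>.zip containing everything under out_dir
#       shutil.make_archive(out_dir, 'zip', out_dir)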
def terms_test():
    # Quick smoke test: build a tiny bilingual term list and run the pipeline on it.
    label = "test"
    # terms = ["apple", "banana", "carrot", "dragonfruit", "edamame", "fennel", "ginger"]
    terms = [("asparagus", "_"), ("broccoli", "_")]
    # terms = [("hong kong", "香港")]
    # df = DataFrame([{"english": term, "chinese": "test", "label": label} for term in terms])
    df = DataFrame([{"english": en_term, "chinese": cn_term, "label": label}
                    for en_term, cn_term in terms])
    run(termlist=df, shuffle=True)
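# For reference, assuming DataFrame here is pandas.DataFrame, the frame built
# above comes out roughly as:
#
#        english chinese label
#   0  asparagus       _  test
#   1   broccoli       _  test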
def single_scrape(session):
    # Scrape one uploaded file and write the result to the session's output folder.
    scrape.run(os.path.join(app.config['UPLOAD_FOLDER'], session['sid'], session['filename']),
               batch=False,
               output=os.path.join(app.config['SCRAPE_OUTPUT_FOLDER'], session['sid']))
    print("DONE", file=sys.stderr)
def scrape_func():
    return scrape.run()
def start():
    # this didn't need to be an api but here we are.
    run(page_number=1, min_date=end_date, max_date=from_date)
    return "finished!"
# this gets rid of tags and condenses whitespace
def striptags(s):
    s = re.sub(r"\<span\s+style\s*\=\s*\"display\:none[^\"]*\"[^\>]*\>[^\<]*\<\/span\>", "", s)
    s = re.sub(r"\&\#160\;", " ", s)
    #return condense(re.sub(r"\<[^\>]*\>", " ", s))
    return condense(s)

# pull the two numeric parts out of a GRIB2 table 4.2 URL
def getUrlArgs(parseUrl):
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()

if len(sys.argv) == 1:  # called with no arguments
    print("Usage: ", sys.argv[0], " url [n]")
    print("       (where n indicates which html table to parse)")
    exit(1)

url = sys.argv[1]
soup = opensoup(url)
tables = soup.findAll("table")  #, {"class":"wikitable sortable"})

for table in tables:
    for r in table.findAll('tr'):
        rl = []
        for c in r.findAll(re.compile('td|th')):
            rl.append(striptags(c.renderContents()))
        if len(rl) > 1 and "href" in rl[1]:
            print('! ' + stripurl(rl[1]))
            scrapeUrl = ('http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-'
                         + getUrlArgs(rl[1])[0] + "-" + getUrlArgs(rl[1])[1] + '.shtml')
            scrape.run(scrapeUrl)
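# Quick doctest-style check of what getUrlArgs() extracts (the URL below is
# just the 4-2-0-1 page as an illustrative example):
#
#   >>> getUrlArgs("http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-0-1.shtml")
#   ('0', '1')
#
# i.e. the two numbers in the table name (the GRIB2 discipline and parameter
# category), which the loop above reassembles into the per-category table URL
# handed to scrape.run().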