def main(sc): """Get information about when the samples were sequenced. INPUT: The times to search for in a list. The folders containing the books, also in a list. FUNCTION: main() OUTPUT: Around 10 seconds per book for an obscure time.""" from get_times import get_times times_to_get = ['04:25', '11:29'] to_lookup = {} all_times = [] for time in times_to_get: some_times = get_times(time) for item in some_times: to_lookup[item] = time all_times = all_times + some_times import time start_time = time.time() with open('time_results.tsv', 'wb') as f: for i in ['books/']: lines = sc.wholeTextFiles(str(i) + '/*.txt') print i, (time.time() - start_time) noon = lines.filter(lambda a: any(x in a[1] for x in all_times)) a = noon.collectAsMap() for key in a: uemp = str(key) temp = uemp.split('/') semp = temp[-1].split('.txt') remp = semp[0].split(' - ') pemp = remp[-1].split('[') oemp = pemp[0].split('(') title = oemp[0] if ',' in remp[-1]: qemp = remp[0].split(', ') author = qemp[-1] + ' ' + qemp[0] else: author = remp[0] to_parallel = a[key] sentences = sc.parallelize(to_parallel.split('. ')) midday = sentences.filter(lambda a: any(x in a for x in all_times)) for sentence in midday.collect(): new_sentence = sentence.replace('\r\n', ' ') new_sentence = new_sentence.replace('\n', ' ') new_sentence = new_sentence.replace('\t', ' ') contained = [x for x in all_times if x in new_sentence] new_title = title.replace('_', ':') new_author = author.replace('_', '.') f.write(to_lookup[contained[0]] + '\t' + new_sentence.encode('utf8') + '.\t' + new_title + '\t' + new_author + '\n') to_parallel = []
def get_chart(chart_type, project_root, request_tups = None): parallel_curl = pyparallelcurl.ParallelCurl(MAX_REQUESTS, CURL_OPTIONS) if request_tups==None: request_tups = get_times.get_times() total_request_tups = len(request_tups)*10 current_request_no = 1 for request_tup in request_tups: for page_no in range(1,11): filename = project_root + "raw_downloaded/" + chart_type + "/"+request_tup[0]+"/"+request_tup[1]+"/" + str(request_tup[2]) + "-" + str(request_tup[3]) + "_page-" + str(page_no) + ".json" if not os.path.isfile(filename): print filename + "does not exist" if chart_type == "top_tracks": api_method = "geo.getmetrotrackchart" elif chart_type == "top_artists": api_method = "geo.getmetroartistchart" else: raise Exception, "Valid scrape types are: top_tracks, top_artists" try: params = tuple([api_method] + list(request_tup)) request = "http://ws.audioscrobbler.com/2.0/?method=%s&format=json&country=%s&metro=%s&start=%d&end=%d" % params except: print params raise Exception request = request + "&api_key="+API_KEY + "&page=" + str(page_no) print str(current_request_no) + "/" + str(total_request_tups) + ": " + request cookie = {"country": request_tup[0], "city": request_tup[1], "start": request_tup[2], "end": request_tup[3], "page": page_no, "type": chart_type, "project_root": project_root} try: parallel_curl.startrequest(request, on_request_done_save, cookie) except TypeError: parallel_curl.startrequest(request.encode("utf-8"), on_request_done_save, cookie) else: pass if (current_request_no % 5000) == 0: print str(current_request_no) + " of " + str(total_request_tups) + " complete." current_request_no += 1
def main(project_root):
    """Run the full scrape pipeline rooted at *project_root*.

    Builds the request tuples, scrapes every metro chart, writes the metro
    listing, then scrapes the tag charts.
    """
    request_tups = get_times.get_times(project_root)
    scrape_metros.scrape_all_charts(project_root, request_tups)
    write_metros(request_tups, project_root)
    # Result was previously bound to an unused local; discard it explicitly.
    scrape_tags.scrape_tags_charts(project_root)