Code Example #1
File: find_times.py  Project: jpwhalley/literaryclock
def main(sc):
    """Get information about when the samples were sequenced.
		INPUT: The times to search for in a list. The folders containing the books, also in a list.
		FUNCTION: main()
		OUTPUT: Around 10 seconds per book for an obscure time."""
    
    from get_times import get_times
    import time

    # Build every textual representation of each target time, and remember
    # which clock time each spelling maps back to.
    times_to_get = ['04:25', '11:29']
    to_lookup = {}
    all_times = []
    for hhmm in times_to_get:
        some_times = get_times(hhmm)
        for item in some_times:
            to_lookup[item] = hhmm
        all_times = all_times + some_times

    start_time = time.time()
    with open('time_results.tsv', 'wb') as f:
        for folder in ['books/']:
            # wholeTextFiles returns (file path, full text) pairs, one per book.
            lines = sc.wholeTextFiles(folder + '*.txt')
            print folder, (time.time() - start_time)
            # Keep only the books whose text contains at least one spelling of a target time.
            noon = lines.filter(lambda book: any(x in book[1] for x in all_times))
            a = noon.collectAsMap()
            # File names are expected to look like
            # '.../Author - Title (extra) [extra].txt'; recover the title and
            # author from the path of each matching book.
            for key in a:
                filename = str(key).split('/')[-1].split('.txt')[0]
                parts = filename.split(' - ')
                title = parts[-1].split('[')[0].split('(')[0]
                if ',' in parts[-1]:
                    # 'Surname, Forename' is reordered to 'Forename Surname'.
                    names = parts[0].split(', ')
                    author = names[-1] + ' ' + names[0]
                else:
                    author = parts[0]
                # Split the book into rough sentences and search them in parallel.
                sentences = sc.parallelize(a[key].split('. '))
                midday = sentences.filter(lambda s: any(x in s for x in all_times))
                for sentence in midday.collect():
                    # Flatten the sentence onto a single line so it fits in one TSV row.
                    new_sentence = sentence.replace('\r\n', ' ').replace('\n', ' ').replace('\t', ' ')
                    contained = [x for x in all_times if x in new_sentence]
                    # '_' is a file-name-safe stand-in: map it back to ':' in titles and '.' in authors.
                    new_title = title.replace('_', ':')
                    new_author = author.replace('_', '.')
                    f.write(to_lookup[contained[0]] + '\t' + new_sentence.encode('utf8') + '.\t' + new_title + '\t' + new_author + '\n')
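For context, a minimal sketch of how main(sc) might be driven, assuming a local PySpark installation; the application name and master URL below are illustrative, not taken from the project.

from pyspark import SparkConf, SparkContext

if __name__ == '__main__':
    # Illustrative local configuration; any SparkContext would do.
    conf = SparkConf().setAppName('literaryclock').setMaster('local[*]')
    sc = SparkContext(conf=conf)
    try:
        main(sc)  # writes matching sentences to time_results.tsv
    finally:
        sc.stop()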
Code Example #2
def get_chart(chart_type, project_root, request_tups=None):
    parallel_curl = pyparallelcurl.ParallelCurl(MAX_REQUESTS, CURL_OPTIONS)
    if request_tups is None:
        request_tups = get_times.get_times()
    # Each request tuple is fetched across 10 chart pages.
    total_request_tups = len(request_tups) * 10
    current_request_no = 1
    for request_tup in request_tups:
        for page_no in range(1,11):
            filename = (project_root + "raw_downloaded/" + chart_type + "/" +
                        request_tup[0] + "/" + request_tup[1] + "/" +
                        str(request_tup[2]) + "-" + str(request_tup[3]) +
                        "_page-" + str(page_no) + ".json")
            # Only request pages that have not already been downloaded.
            if not os.path.isfile(filename):
                print filename + " does not exist"
                if chart_type == "top_tracks":
                    api_method = "geo.getmetrotrackchart"
                elif chart_type == "top_artists":
                    api_method = "geo.getmetroartistchart"
                else:
                    raise Exception("Valid scrape types are: top_tracks, top_artists")
                try:
                    params = tuple([api_method] + list(request_tup))
                    request = ("http://ws.audioscrobbler.com/2.0/?method=%s&format=json"
                               "&country=%s&metro=%s&start=%d&end=%d" % params)
                except Exception:
                    # Show the offending parameters, then re-raise the original error.
                    print params
                    raise
                request = request + "&api_key="+API_KEY + "&page=" + str(page_no)
                print str(current_request_no) + "/" + str(total_request_tups) + ": " + request
                cookie = {"country": request_tup[0],
                          "city": request_tup[1],
                          "start": request_tup[2],
                          "end": request_tup[3],
                          "page": page_no,
                          "type": chart_type,
                          "project_root": project_root}
                try:
                    parallel_curl.startrequest(request, on_request_done_save, cookie)
                except TypeError:
                    # pycurl expects a byte string; retry with the URL encoded as
                    # UTF-8 when a metro name contains non-ASCII characters.
                    parallel_curl.startrequest(request.encode("utf-8"), on_request_done_save, cookie)
            if (current_request_no % 5000) == 0:
                print str(current_request_no) + " of " + str(total_request_tups) + " complete."
            current_request_no += 1
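For reference, a hypothetical call site, assuming the module-level constants (MAX_REQUESTS, CURL_OPTIONS, API_KEY) are configured elsewhere; the project path is made up, and it must end with a slash because get_chart concatenates it directly with "raw_downloaded/".

if __name__ == "__main__":
    # Hypothetical project directory; request tuples default to get_times.get_times().
    get_chart("top_tracks", "/data/lastfm_charts/")
    get_chart("top_artists", "/data/lastfm_charts/")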
Code Example #3
def main(project_root):
    # Work out which (country, metro, start, end) windows need to be requested.
    request_tups = get_times.get_times(project_root)
    # Download the raw chart pages and write the per-metro output files.
    scrape_metros.scrape_all_charts(project_root, request_tups)
    write_metros(request_tups, project_root)
    # Finally, scrape the tag charts for the same project.
    tags_charts = scrape_tags.scrape_tags_charts(project_root)
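A possible command-line entry point for this driver, assuming the project root (with a trailing slash) is passed as the first argument; the argument handling here is illustrative only.

import sys

if __name__ == "__main__":
    # Expect the project root, e.g. /data/lastfm_charts/, as the only argument.
    main(sys.argv[1])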