Example #1
0
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl") 
    print proc_num, "Start loop"
    while True:
        try: 
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = matstore.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl") 
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            new_pair = (indexing.word_to_static_id(i_word, merged_index), 
                    indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count
        
        print proc_num, "Writing counts for year", year
        matstore.export_mats_from_dicts({str(year) : fixed_counts}, out_dir)
Example #2
0
def main(proc_num, queue, out_dir, in_dir):
    merged_index = ioutils.load_pickle(out_dir + "merged_index.pkl")
    print proc_num, "Start loop"
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "Fixing counts for year", year
        fixed_counts = {}
        old_mat = matstore.retrieve_mat_as_dict(in_dir + str(year) + ".bin")
        old_index = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        for pair, count in old_mat.iteritems():
            try:
                i_word = old_index[pair[0]]
            except IndexError:
                print pair
                sys.exit(0)
            c_word = old_index[pair[1]]
            new_pair = (indexing.word_to_static_id(i_word, merged_index),
                        indexing.word_to_static_id(c_word, merged_index))
            fixed_counts[new_pair] = count

        print proc_num, "Writing counts for year", year
        matstore.export_mats_from_dicts({str(year): fixed_counts}, out_dir)
Example #3
0
def main(proc_num, lock, download_dir, source):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            with open(loc_dir + name + '.csv.zip', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(
            ['unzip', '-o', loc_dir + name + '.csv.zip', '-d', loc_dir])
        subprocess.call([
            'mv', loc_dir + 'googlebooks-' + source + '-' + TYPE + '-' +
            VERSION + '-' + name + '.csv', loc_dir + name
        ])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id,
                                indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    pass

        print proc_num, "Writing", name, n
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.csv.zip')
        except:
            pass
Example #4
0
def main(proc_num, lock, page, download_dir, source):
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            if EXCLUDE_PATTERN.match(url):
                continue
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
#        subprocess.call(['mv', loc_dir + 'googlebooks-' + source + '-' +  TYPE + '-' + VERSION + '-' + name + '.csv', loc_dir + name])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        skipped = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    skipped += 1
                    pass

        print proc_num, "Writing", name, "Skipped", skipped
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.gz')
        except:
            pass