Example #1
def main(proc_num, queue, out_dir, download_dir, context_size):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index) for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")

        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
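The update_count helper is not shown in this example. A minimal sketch of what it might do, assuming it pairs the word at position pos with every other word in the 5-gram and accumulates the pair counts for that year (the actual helper in the source project may differ):

def update_count(ngram, pos, year, count, year_counters):
    # Hypothetical sketch: pair ngram[pos] with every other word id in the
    # ngram and add `count` occurrences to that year's co-occurrence Counter.
    if pos >= len(ngram):
        return year_counters
    target = ngram[pos]
    for i, context_id in enumerate(ngram):
        if i != pos:
            year_counters[year][(target, context_id)] += count
    return year_counters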
Example #2
def split_main(proc_num, queue, download_dir):
    print(proc_num, "Start loop")
    while True:
        if queue.empty():
            break
        url = queue.get()
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        dirs = set(os.listdir(download_dir))
        if name in [file.split("-")[0] for file in dirs]:
            continue

        print(proc_num, "Name", name)
        loc_dir = download_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print(proc_num, "Downloading", name)
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'wb') as f:  # binary mode: urlopen().read() returns bytes
                try:
                    f.write(urllib.request.urlopen(url, timeout=60).read())
                    success = True
                except:
                    print("Fail!!")
                    continue

        print(proc_num, "Unzipping", name)
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print(proc_num, "Splitting", name)
        subprocess.call([
            "split", "-l",
            str(LINE_SPLIT), loc_dir + name, download_dir + "/" + name + "-"
        ])
        os.remove(loc_dir + name)
        os.rmdir(loc_dir)
Example #3
def split_main(proc_num, queue, download_dir):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        url = queue.get()
        name = re.search('%s-(.*).gz' % VERSION, url).group(1)
        dirs = set(os.listdir(download_dir))
        if name in [file.split("-")[0] for file in dirs]:
            continue

        print proc_num, "Name", name
        loc_dir = download_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print proc_num, "Downloading", name
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    print "Fail!!"
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print proc_num, "Splitting", name
        subprocess.call(["split", "-l", str(LINE_SPLIT), loc_dir + name, download_dir + "/" +  name + "-"])
        os.remove(loc_dir + name)
        os.rmdir(loc_dir)
Example #4
def worker(proc_num, queue, dir, count_dir, min_count, checkpoints):
    while True:
        if queue.empty():
            break
        year = queue.get()
        freqs = load_pickle(count_dir + str(year) + "-counts.pkl")
        for n in checkpoints:
            out_dir = dir + '{:03d}'.format(n) + "/"
            mkdir(out_dir)
            subprocess.call(['mv', dir + str(year) + '-w.' + '{:03d}'.format(n), out_dir + str(year) + '-w'])
            print "Loading data..", year, "iterations", n
            text2numpy(out_dir, freqs, year)
Example #5
def run_parallel(num_procs, out_dir, in_dir, years):
    ioutils.mkdir(out_dir)
    lock = Lock()
    procs = [
        Process(target=main,
                args=[i, lock, out_dir + "/", in_dir + "/", years])
        for i in range(num_procs)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #6
def run_parallel(num_processes, root_dir, source, context_size):
    queue = Queue()
    download_dir = root_dir + '/' + source + '/raw/'
    out_dir = root_dir + '/' + source + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(out_dir)

    for name in os.listdir(download_dir):
        queue.put(name)
    procs = [Process(target=main, args=[i, queue, out_dir, download_dir, context_size]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #7
def run_parallel(num_processes, in_dir, out_dir):
    queue = Queue()
    ioutils.mkdir(out_dir)

    for zipped_file in os.listdir(in_dir):
        if not os.path.isfile(in_dir + "/" + zipped_file) or not zipped_file.endswith((".gz")):
            continue
        queue.put(zipped_file)

    procs = [Process(target=split_main, args=[i, queue, in_dir, out_dir]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #8
def run_parallel(num_processes, root_dir, out_dir, context_size, is_zipped):
    queue = Queue()
    download_dir = root_dir + '/'
    out_dir = out_dir + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(out_dir)

    for name in os.listdir(download_dir):
        if name == ".DS_Store":
            continue
        queue.put(name)
    procs = [Process(target=main, args=[i, queue, out_dir, download_dir, context_size, is_zipped]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #9
def split_main(proc_num, queue, download_dir, out_dir_g, context_size):
    print proc_num, "Start loop"
    while True:  # iterates through the URLs of the data files
        if queue.empty():
            break
        url = queue.get()
        name = re.search('%s-(.*).gz' % VERSION,
                         url).group(1)  # gets the name of the individual file
        dirs = set(os.listdir(download_dir))
        if name in [file.split("-")[0] for file in dirs]:
            continue

        print proc_num, "Name", name  # creates the directiory for the downloading file
        loc_dir = download_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print proc_num, "Downloading", name  # downloads the actual compressed file
        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    print "Fail!!"
                    continue

        print proc_num, "Unzipping", name  # Unzips and put the datafile into max LINE_SPLIT line long files
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
        print proc_num, "Splitting", name
        subprocess.call([
            "split", "-l",
            str(LINE_SPLIT), loc_dir + name, download_dir + "/" + name + "-"
        ])
        os.remove(loc_dir + name)
        os.rmdir(loc_dir)

        # runs gramgrab
        print proc_num, "gram_grab", name
        queue_g = Queue()
        for item in os.listdir(download_dir + "/"):
            if name + "-" in item:
                queue_g.put(item)
        time.sleep(0.01)  # needed to avoid instability
        main(proc_num, queue_g, out_dir_g, download_dir, context_size)
        for item in os.listdir(download_dir + "/"):
            if name + "-" in item:
                os.remove(download_dir + "/" + item)
Example #10
def run_parallel(num_processes, out_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    queue = Queue()
    for url in urls:
        queue.put(url)
    ioutils.mkdir(out_dir + '/' + source + '/raw')
    download_dir = out_dir + '/' + source + '/raw/'
    ioutils.mkdir(download_dir)
    procs = [Process(target=split_main, args=[i, queue, download_dir]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
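For reference, the v2 Google Books Ngram links on that page look roughly like googlebooks-eng-all-5gram-20120701-aa.gz (the exact source, TYPE, and VERSION values depend on configuration), so the '%s-(.*).gz' % VERSION search used by split_main extracts the trailing shard suffix. A small illustrative check, with an assumed VERSION value:

import re

VERSION = "20120701"  # assumed value; defined elsewhere in the original module
url = "http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-aa.gz"
name = re.search('%s-(.*).gz' % VERSION, url).group(1)
print(name)  # -> "aa", the per-shard suffix used as the working directory name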
Example #11
def main(proc_num, queue, out_dir, download_dir, context_size):
    print proc_num, "Start loop"
    while True:  # Iterates through the downloaded ngram files
        if queue.empty():
            break
        name = queue.get()
        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)  # Creates a directory for each downloaded file, where the yearly co-occurrence matrices will be put

        print proc_num, "Going through", name
        index = collections.OrderedDict()  # index: bijection between words and integers (the integers are the indices of the co-occurrence matrix)
        year_counters = collections.defaultdict(collections.Counter)  # year_counters: for each year, the co-occurrence matrix as a Counter mapping word pairs to occurrence counts
        # time.sleep(120 * random.random())  # Sometimes needed to avoid instability
        with open(download_dir + name) as f:
            for i, l in enumerate(f):  # Iterates through the individual ngram file
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [
                    indexing.word_to_id(word.split("_")[0], index)
                    for word in split[0].split()
                ]  # Lists the indices of the words in the ngram
                year = split[1]
                count = int(split[2])
                # Modifies the co-occurrence matrix with the new information
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count,
                                                 year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count,
                                                 year_counters)
                    year_counters = update_count(ngram, 4, year, count,
                                                 year_counters)
                else:
                    raise Exception("Unsupported context size")

        print proc_num, "Writing", name  # Writes the yearly co-occurrence matrices into .bin files
        #time.sleep(120 * random.random())  # Sometimes it is needed to eliminate unstability
        for year, counter in year_counters.iteritems():
            sparse_io.export_mat_from_dict(counter, loc_dir + year + ".bin")
        ioutils.write_pickle(index, loc_dir + "index.pkl")  # Saves the index
Example #12
def run_parallel(num_processes, root_dir, source, context_size):
    queue = Queue()
    download_dir = root_dir + '/' + source + '/raw/'
    out_dir = root_dir + '/' + source + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(out_dir)

    for name in os.listdir(download_dir):
        queue.put(name)
    procs = [
        Process(target=main,
                args=[i, queue, out_dir, download_dir, context_size])
        for i in range(num_processes)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #13
def run(out_dir, in_dir, years, language):
    index = collections.OrderedDict()
    for year in years:  # Iterates through the years
        print "Merging year", year
        year_list = ioutils.load_pickle(in_dir + str(year) + "-list.pkl")
        i = 0
        for i in xrange(len(year_list)):  # Iterates through the words in the individual year
            word = year_list[i]
            stop_set = set(stopwords.words(language))
            if word.isalpha() and not word in stop_set:
                indexing.word_to_cached_id(
                    word, index)  # Put every word in the common index

    ioutils.mkdir(out_dir)
    ioutils.write_pickle(index, out_dir + "merged_index.pkl")
    ioutils.write_pickle(list(index), out_dir + "merged_list.pkl")
Example #14
def run_parallel(num_processes, out_dir, source):
    ioutils.mkdir(out_dir)
    ioutils.mkdir(out_dir + '/' + source)
    ioutils.mkdir(out_dir + '/' + source + '/' + VERSION)
    download_dir = out_dir + '/' + source + '/' + VERSION + '/' + TYPE + '/'
    ioutils.mkdir(download_dir)
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, download_dir, source]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #15
def main(proc_num, queue, out_dir, download_dir, context_size, is_zipped):
    print proc_num, "Start loop"
    while True:
        if queue.empty():
            break
        name = queue.get()

        if is_zipped:
            if not name.endswith((".gz")):
                continue
            print "Unzipping " + name + " ..."
            subprocess.call(['gunzip', '-f', download_dir + name, '-d'])
            name = name.split(".gz")[0]

        loc_dir = out_dir + "/" + name + "/"
        ioutils.mkdir(loc_dir)

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        time.sleep(120 * random.random())
        with open(download_dir + name) as f:
            for i, l in enumerate(f):
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                ngram = [indexing.word_to_id(word.split("_")[0], index) for word in split[0].split()]
                year = split[1]
                count = int(split[2])
                if context_size == 2:
                    year_counters = update_count(ngram, 2, year, count, year_counters)
                elif context_size == 4:
                    year_counters = update_count(ngram, 0, year, count, year_counters)
                    year_counters = update_count(ngram, 4, year, count, year_counters)
                else:
                    raise Exception("Unsupported context size")

        print proc_num, "Writing", name
        time.sleep(120 * random.random())
        sparse_io_ref.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")
        os.remove(download_dir + name)
Example #16
def main(proc_num, queue, out_dir, in_dir, context_size):
    ioutils.mkdir(out_dir)
    print proc_num, "Start loop"
    while True:  # Iterates through the years
        try:
            year = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            break
        print proc_num, "- Loading mat for year", year
        year_mat = load_matrix(in_dir + str(year) + ".bin")
        index = ioutils.load_pickle(in_dir + str(year) + "-index.pkl")
        print proc_num, "- Processing data for year", year
        counts = year_mat.sum(1) / (2 * context_size)  # sums up the occurrences
        counts = {
            word: int(counts[index[word]])
            for word in index if index[word] < len(counts)
        }
        ioutils.write_pickle(counts, out_dir + "/" + str(year) +
                             "-counts.pkl")  # writes it in a file
Example #17
def run_parallel(num_processes, out_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    ioutils.mkdir(out_dir)
    ioutils.mkdir(out_dir + '/' + source)
    ioutils.mkdir(out_dir + '/' + source + '/' + VERSION)
    download_dir = out_dir + '/' + source + '/' + VERSION + '/' + TYPE + '/'
    ioutils.mkdir(download_dir)
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, page, download_dir, source]) for i in range(num_processes)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #18
def run_parallel(num_processes, out_dir, source):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    queue = Queue()
    for url in urls:
        queue.put(url)
    ioutils.mkdir(out_dir + '/' + source + '/raw')
    download_dir = out_dir + '/' + source + '/raw/'
    ioutils.mkdir(download_dir)
    procs = [
        Process(target=split_main, args=[i, queue, download_dir])
        for i in range(num_processes)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #19
def run_parallel(num_processes, out_dir, source, context_size):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html"
    )  # gets the URLs from the HTML of the dataset index page
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page
    queue = Queue()
    for url in urls:  # puts the urls into a queue
        queue.put(url)
    ioutils.mkdir(out_dir + '/' + source + '/raw')
    download_dir = out_dir + '/' + source + '/raw/'
    out_dir_g = out_dir + '/' + source + '/c' + str(context_size) + '/raw/'
    ioutils.mkdir(download_dir)
    procs = [
        Process(target=split_main,
                args=[i, queue, download_dir, out_dir_g, context_size])
        for i in range(num_processes)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #20
        row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat)
        old_index = list(ioutils.load_pickle(in_dir + str(year) + "-index.pkl"))
        new_index = collections.OrderedDict()
        for i in xrange(len(keep_rows)):
            new_index[old_index[keep_rows[i]]] = i
        ioutils.write_pickle(new_index, in_dir + "/second/" + str(year) + "-index.pkl")
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, in_dir + "/second/")

def run_parallel(num_procs, in_dir, years):
    queue = Queue()
    random.shuffle(years)
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=[i, queue, in_dir, years]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
 
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Makes and stores second order matrices from first order PPMI data.")
    parser.add_argument("in_dir", help="path to first order data")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    ioutils.mkdir(args.in_dir + "/second")
    run_parallel(args.num_procs, args.in_dir + "/", years) 
Example #21
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Making knn net for year", year
        old_mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
        row_d, col_d, data_d = make_knn_mat(old_mat, k)
        
        print proc_num, "Writing counts for year", year
        matstore.export_mat_eff(row_d, col_d, data_d, year, tmp_pref)

def run_parallel(num_procs, in_dir, years, k):
    lock = Lock()
    procs = [Process(target=main, args=[i, lock, in_dir, years, k]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
 
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("in_dir", help="path to unmerged data")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=END_YEAR)
    parser.add_argument("--k", type=int, help="k nn thresh", default=K)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    ioutils.mkdir(args.in_dir + "/dknn-" + str(args.k))
    run_parallel(args.num_procs, args.in_dir + "/", years, args.k) 
Example #22
def main(in_dir, out_dir, years):
    name_list = []

    print "Start loop"
    for zipped_file in os.listdir(in_dir):
        if not os.path.isfile(in_dir + "/" + zipped_file) or not zipped_file.endswith((".gz")):
            continue

        tmp_year_counts = {}
        tmp_year_doc_counts = {}
        tmp_year_pos = {}
        for year in years:
            tmp_year_counts[year] = {}
            tmp_year_doc_counts[year] = {}
            tmp_year_pos[year] = {}

        name = zipped_file.split(".gz")[0]
        name_list.append(name)
        print  "Unzipping", name
        subprocess.call(['gunzip', '-f', in_dir + "/" + name + '.gz', '-d'])

        print  "Going through", name
        with open(in_dir + "/" + name) as f:
            for l in f:
                try:
                    split = l.strip().split('\t')
                    if not POS.match(split[0]):
                        continue
                    count = int(split[2])
                    if count < 10:
                        continue
                    word_info = split[0].split("_") 
                    pos = word_info[-1]
                    word = word_info[0].decode('utf-8').lower()
                    word = word.strip("\"")
                    word = word.split("\'s")[0]
                    year = int(split[1])
                    doc_count = int(split[3])
                    if not year in years:
                        continue
                    if not word in tmp_year_counts[year]:
                        tmp_year_counts[year][word] = 0
                        tmp_year_doc_counts[year][word] = 0
                        tmp_year_pos[year][word] = collections.Counter() 
                    tmp_year_counts[year][word] += count 
                    tmp_year_doc_counts[year][word] += doc_count 
                    tmp_year_pos[year][word][pos] += count
                except UnicodeDecodeError:
                     pass

        print "Writing tmp " + name
        ioutils.mkdir(out_dir + "/" + name)
        for year in years:
            ioutils.write_pickle(tmp_year_counts[year], out_dir + "/" + name + "/" + str(year) + "-counts.pkl")
            ioutils.write_pickle(tmp_year_doc_counts[year], out_dir + "/" + name + "/" + str(year) +  "-doc_counts.pkl")
            ioutils.write_pickle(tmp_year_pos[year], out_dir + "/" + name + "/" + str(year) +  "-pos.pkl")
        
        print "Deleting", name
        try:
            os.remove(in_dir + "/" + name)
            os.remove(in_dir + "/"+ name + '.gz')
        except:
            pass

    print "Merging..."
    merge_year_counts(out_dir, name_list, years)
Example #23
def main(proc_num, lock, download_dir, source):
    page = requests.get(
        "http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.csv.zip)' %
                         (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).csv.zip' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            with open(loc_dir + name + '.csv.zip', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(
            ['unzip', '-o', loc_dir + name + '.csv.zip', '-d', loc_dir])
        subprocess.call([
            'mv', loc_dir + 'googlebooks-' + source + '-' + TYPE + '-' +
            VERSION + '-' + name + '.csv', loc_dir + name
        ])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id,
                                indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    pass

        print proc_num, "Writing", name, n
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.csv.zip')
        except:
            pass
Example #24
        export_mat_from_dict(pair_counts, decade, OUT.format(type=type, window_size=window_size))

def _process_context(context, pair_counts, window_size):
    if len(context) < window_size + 1:
        return pair_counts
    target = context[window_size]
    indices = range(0, window_size)
    indices.extend(range(window_size + 1, 2 * window_size + 1))
    for i in indices:
        if i >= len(context):
            break
        pair_counts[(target, context[i])] += 1
    return pair_counts

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("type")
    parser.add_argument("window_size", type=int)
    parser.add_argument("--workers", type=int, default=25)
    args = parser.parse_args()
    mkdir(OUT.format(type=args.type, window_size=args.window_size))
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    id_map = load_pickle(DICT.format(type=args.type))
    procs = [Process(target=worker, args=[i, queue, args.window_size, args.type, id_map]) for i in range(args.workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
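A quick illustration of how the _process_context helper above pairs a target word with its neighbours (toy input; note that indices = range(...) followed by .extend(...) assumes Python 2, where range returns a list):

from collections import defaultdict

pair_counts = defaultdict(int)
# With window_size=2 the target is context[2]; it is paired with up to two
# words on either side.
_process_context(["w1", "w2", "t", "w3", "w4"], pair_counts, 2)
# pair_counts -> {("t", "w1"): 1, ("t", "w2"): 1, ("t", "w3"): 1, ("t", "w4"): 1}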
Example #25

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Computes various frequency statistics.")
    parser.add_argument("out_dir")
    parser.add_argument("in_dir")
    parser.add_argument("count_dir")
    parser.add_argument("wordlist")
    parser.add_argument("--num_sam", type=int, default=10000000)
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=1800)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=2000)
    parser.add_argument("--year-inc",
                        type=int,
                        help="end year (inclusive)",
                        default=1)
    parser.add_argument("--sample", type=float, default=1e-5)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    ioutils.mkdir(args.out_dir)
    wordlist = ioutils.load_pickle(args.wordlist)
    run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/", years,
                 wordlist, args.count_dir, args.num_sam, args.sample)
Example #26
    years = year_index_infos.keys()
    random.shuffle(years)
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=[i, queue, out_pref, in_dir, year_index_infos, knn, thresh]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    merge(out_pref, years, get_full_word_list(year_index_infos))
 
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes network statistics for second order data.")
    parser.add_argument("dir", help="path to directory with co-occurrence data and index")
    parser.add_argument("word_file", help="path to sorted word file(s).", default=None)
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include. Must also specifiy word file and index.", default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1900)
    parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=2000)
    parser.add_argument("--thresh", type=float, help="optional threshold", default=0)
    parser.add_argument("--knn", type=int, help="optional number of nearest neighbours", default=None)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    year_index_infos = ioutils.load_year_index_infos(args.dir, years, args.word_file, num_words=args.num_words)
    outpref = args.dir + "/secondnetstats-" + str(args.thresh) + "-" + str(args.knn) + "/" 
    ioutils.mkdir(outpref)
    outpref += args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    run_parallel(args.num_procs, outpref, args.dir + "/", year_index_infos, knn=args.knn, thresh=args.thresh)
Example #27
def main(proc_num, lock, page, download_dir, source):
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            if EXCLUDE_PATTERN.match(url):
                continue
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])
#        subprocess.call(['mv', loc_dir + 'googlebooks-' + source + '-' +  TYPE + '-' + VERSION + '-' + name + '.csv', loc_dir + name])

        print proc_num, "Going through", name
        index = collections.OrderedDict()
        year_counters = collections.defaultdict(collections.Counter)
        skipped = 0
        with open(loc_dir + name) as f:
            for l in f:
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    context = ngram[:middle_index] + ngram[middle_index + 1:]
                    item_id = indexing.word_to_id(item, index)
                    year = split[1]
                    count = int(split[2])
                    for context_word in context:
                        pair = (item_id, indexing.word_to_id(context_word, index))
                        year_counters[year][pair] += count
                except:
                    skipped += 1
                    pass

        print proc_num, "Writing", name, "Skipped", skipped
        matstore.export_mats_from_dicts(year_counters, loc_dir)
        ioutils.write_pickle(index, loc_dir + "index.pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name)
            os.remove(loc_dir + name + '.gz')
        except:
            pass
Example #28
    procs = [
        Process(target=worker, args=[i, queue, in_dir, years])
        for i in range(num_procs)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Makes and stores second order matrices from first order PPMI data.")
    parser.add_argument("in_dir", help="path to first order data")
    parser.add_argument("num_procs",
                        type=int,
                        help="number of processes to spawn")
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="start year (inclusive)",
                        default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    ioutils.mkdir(args.in_dir + "/second")
    run_parallel(args.num_procs, args.in_dir + "/", years)
Example #29
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")

def run_parallel(num_procs, out_dir, in_dir, decades):
    queue = Queue()
    for decade in decades:
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue, out_dir, in_dir]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("out_dir", help="path to network data (also where output goes)")
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, help="end year (inclusive)")
    args = parser.parse_args()
    decades = range(args.start_year, args.end_year + 1, 10)
    decades.reverse()
    mkdir(args.out_dir)
    run_parallel(args.num_procs, args.out_dir + "/",  args.in_dir + "/", decades)
Example #30

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("dir")
    parser.add_argument("rep_type")
    parser.add_argument("num_emb",
                        help="number of different embedding-series",
                        type=int)
    parser.add_argument("wordlist")
    parser.add_argument("--start-year", type=int, default=1800)
    parser.add_argument("--end-year", type=int, default=2000)
    parser.add_argument("--year-inc", type=int, default=1)
    parser.add_argument("--dim", type=int, default=300)
    args = parser.parse_args()
    kwargs = dict()
    if (args.rep_type.lower() == "sgns"):
        kwargs["normalize"] = False
    wordlist = load_pickle(args.wordlist)
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    mkdir(args.dir + "/embedding_avg/")
    mkdir(args.dir + "/embedding_avg/aligned")
    for i in range(1, args.num_emb + 1):
        mkdir(args.dir + "/embedding_" + str(i) + "/noinit/" + str(args.dim) +
              "/aligned/")
    for year in years:
        align_cloud(year, args.rep_type, args.dir, args.num_emb, args.dim,
                    wordlist, **kwargs)
    align_years(years, args.rep_type, args.dir, args.num_emb, args.dim,
                **kwargs)
Example #31
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=1900)
    parser.add_argument("--end-year",
                        type=int,
                        help="start year (inclusive)",
                        default=2000)
    parser.add_argument("--year-inc",
                        type=int,
                        help="year increment",
                        default=1)
    parser.add_argument("--thresh",
                        type=float,
                        help="optional threshold",
                        default=None)
    args = parser.parse_args()
    years = list(range(args.start_year, args.end_year + 1, args.year_inc))
    year_index_infos = ioutils.load_year_index_infos(args.dir,
                                                     years,
                                                     args.word_file,
                                                     num_words=args.num_words)
    outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    if args.thresh != None:
        outpref += "-" + str(args.thresh)
    ioutils.mkdir(args.dir + "/netstats")
    run_parallel(args.num_procs, args.dir + outpref, args.dir + "/",
                 year_index_infos, args.thresh)
Example #32
    for p in procs:
        p.join()
    print "Merging"
    full_word_set = set([])
    for year_words in word_list.itervalues():
        full_word_set = full_word_set.union(set(year_words))
    merge(out_pref, years, list(full_word_set))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes semantic change statistics for words.")
    parser.add_argument("dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include", default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=END_YEAR)
    parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    word_lists = ioutils.load_year_words(args.word_file, years)
    if args.num_words != -1:
        for year in years:
            word_lists[year] = word_lists[year][:args.num_words]
    ioutils.mkdir(args.dir + "/volstats")
    outpref ="/volstats/" + args.word_file.split("/")[-1].split(".")[0] + "-" + str(args.thresh)
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    displacement_base = simple_create_representation(REP_TYPE, args.dir + "/" +  str(args.disp_year) + ".bin", restricted_context=word_lists[args.end_year], thresh=args.thresh)
    run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", years[1:], word_lists, displacement_base, args.thresh)       
Example #33
    target = context[window_size]
    indices = range(0, window_size)
    indices.extend(range(window_size + 1, 2 * window_size + 1))
    for i in indices:
        if i >= len(context):
            break
        pair_counts[(target, context[i])] += 1
    return pair_counts


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("type")
    parser.add_argument("window_size", type=int)
    parser.add_argument("--workers", type=int, default=25)
    args = parser.parse_args()
    mkdir(OUT.format(type=args.type, window_size=args.window_size))
    queue = Queue()
    for decade in range(1810, 2010, 10):
        queue.put(decade)
    id_map = load_pickle(DICT.format(type=args.type))
    procs = [
        Process(target=worker,
                args=[i, queue, args.window_size, args.type, id_map])
        for i in range(args.workers)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #34
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("dir", help="path to directory with count data and index")
    parser.add_argument("word_file", help="path to sorted word file(s).", default=None)
    parser.add_argument("sample_file", help="path to file with sample sizes.", default=None)
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include. Must also specifiy word file and index.", default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1900)
    parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=2000)
    parser.add_argument("--num-boots", type=int, help="Number of bootstrap samples", default=10)
    parser.add_argument("--smooth", type=int, help="laplace smoothing factor", default=10)
    parser.add_argument("--alpha", type=float, help="confidence threshold for edges", default=0.05)
    parser.add_argument("--fwer-control", action='store_true', help="use Bonferroni")
    parser.add_argument("--id", type=int, help="run id", default=0)
    args = parser.parse_args()
    sample_sizes = ioutils.load_pickle(args.sample_file)
    eff_sample_size = np.percentile(np.array(sample_sizes.values()), 10)
    if args.smooth == 0:
        smooth = 0
    else:
        smooth = 10.0**(-1*float(args.smooth))
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(args.dir + "/index.pkl")
    year_index_infos = ioutils.load_year_index_infos_common(index, years, args.word_file, num_words=args.num_words) 
    outpref = "/bootstats-" + str(args.alpha) + "-" + str(args.fwer_control) + "/" +  args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    ioutils.mkdir(args.dir + "/" + outpref.split("/")[1])
    run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", year_index_infos, args.num_boots, smooth, eff_sample_size, args.alpha, args.fwer_control, args.id)       
Example #35
def main(proc_num, lock, download_dir, source):
    page = requests.get("http://storage.googleapis.com/books/ngrams/books/datasetsv2.html")
    pattern = re.compile('href=\'(.*%s-%s-%s-.*\.gz)' % (source, TYPE, VERSION))
    urls = pattern.findall(page.text)
    del page

    print proc_num, "Start loop"
    while True:
        lock.acquire()
        work_left = False
        for url in urls:
            name = re.search('%s-(.*).gz' % VERSION, url).group(1)
            dirs = set(os.listdir(download_dir))
            if name in dirs:
                continue

            work_left = True
            print proc_num, "Name", name
            loc_dir = download_dir + "/" + name + "/"
            ioutils.mkdir(loc_dir)
            break
        lock.release()
        if not work_left:
            print proc_num, "Finished"
            break

        print proc_num, "Downloading", name

        success = False
        while not success:
            with open(loc_dir + name + '.gz', 'w') as f:
                try:
                    f.write(urllib2.urlopen(url, timeout=60).read())
                    success = True
                except:
                    continue

        print proc_num, "Unzipping", name
        subprocess.call(['gunzip', '-f', loc_dir + name + '.gz', '-d'])

        print proc_num, "Going through", name
        year_grams = collections.defaultdict(dict)
        n = 0
        with open(loc_dir + name) as f:
            for l in f:
                l = l.decode('utf-8').lower()
                split = l.strip().split('\t')
                if EXCLUDE_PATTERN.match(split[0]):
                    continue
                try:
                    ngram = split[0].split()
                    middle_index = len(ngram) // 2
                    item = ngram[middle_index]
                    if (not item.isalpha()) or item in STOPWORDS:
                        continue
                    year = split[1]
                    count = int(split[2])
                    if item not in year_grams[year]:
                        year_grams[year][item] = [(l, count)]
                    else:
                        year_grams[year][item].append((l, count))
                except:
                    #print "!", l.strip().split()
                    pass

        print proc_num, "Writing", name, n
        for year in year_grams:
            ioutils.write_pickle(year_grams[year], loc_dir + str(year) + ".pkl")

        print proc_num, "Deleting", name
        try:
            os.remove(loc_dir + name + '.gz')
        except:
            pass
Example #36
    parser.add_argument("--end-year",
                        type=int,
                        help="start year (inclusive)",
                        default=2000)
    parser.add_argument("--thresh",
                        type=float,
                        help="optional threshold",
                        default=0)
    parser.add_argument("--knn",
                        type=int,
                        help="optional number of nearest neighbours",
                        default=None)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    year_index_infos = ioutils.load_year_index_infos(args.dir,
                                                     years,
                                                     args.word_file,
                                                     num_words=args.num_words)
    outpref = args.dir + "/secondnetstats-" + str(args.thresh) + "-" + str(
        args.knn) + "/"
    ioutils.mkdir(outpref)
    outpref += args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    run_parallel(args.num_procs,
                 outpref,
                 args.dir + "/",
                 year_index_infos,
                 knn=args.knn,
                 thresh=args.thresh)
Example #37
import ioutils
from cooccurrence.laplaceppmigen import run_parallel

SMOOTH = 10
START_YEAR = 1900
END_YEAR = 2000

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computed laplace smoothed normalized PPMI values.")
    parser.add_argument("out_dir", help="directory where data will be stored")
    parser.add_argument("in_dir", help="path to unmerged data")
    parser.add_argument("word_file", help="file of restricted word set", default=None)
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--conf-dir", help="optional file of restricted word set", default=None)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=END_YEAR)
    parser.add_argument("--num-words", type=int, help="size of vocabulary", default=20000)
    parser.add_argument("--smooth", type=int, help="smoothing factor", default=SMOOTH)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    if args.smooth == 0:
        smooth = 0
    else:
        smooth = 10.0**(-1*float(args.smooth))
    index = ioutils.load_pickle(args.in_dir + "/index.pkl")
    year_index_infos = ioutils.load_year_index_infos_common(index, years, args.word_file, num_words=args.num_words) 
    out_dir = args.out_dir + "/lsmooth" + str(args.smooth) 
    ioutils.mkdir(out_dir)
    run_parallel(args.num_procs,  out_dir + "/", args.in_dir + "/", smooth, year_index_infos, args.conf_dir)       

Example #38
            if year == 0:
                merged_year_counts = year_counts
            for word, count in year_counts.iteritems():
                if not word in merged_year_counts:
                    merged_year_counts[word] = 0
                merged_year_counts[word] += year_counts[word]

        write_pickle(merged_year_counts, out_dir + str(decade) + "-counts.pkl")

def run_parallel(num_procs, out_dir, in_dir, decades):
    queue = Queue()
    for decade in decades:
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue, out_dir, in_dir]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merge counts for 1gram data.")
    parser.add_argument("base_dir", help="base directoty. /counts should be a subdir")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, help="end year (inclusive)")
    args = parser.parse_args()
    decades = range(args.start_year, args.end_year+1, 10)
    decades.reverse()
    out_dir = args.base_dir + "/decades/counts/"
    mkdir(out_dir)
    run_parallel(args.num_procs, out_dir,  args.base_dir + "/counts/", decades)
Example #39
    procs = [
        Process(target=main, args=[i, lock, in_dir, years, k])
        for i in range(num_procs)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Merges years of raw 5gram data.")
    parser.add_argument("in_dir", help="path to unmerged data")
    parser.add_argument("num_procs",
                        type=int,
                        help="number of processes to spawn")
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="start year (inclusive)",
                        default=END_YEAR)
    parser.add_argument("--k", type=int, help="k nn thresh", default=K)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    ioutils.mkdir(args.in_dir + "/dknn-" + str(args.k))
    run_parallel(args.num_procs, args.in_dir + "/", years, args.k)
Example #40
        os.remove(out_dir + str(year) + ".tmp.txt")

def run_parallel(num_procs, out_dir, in_dir, count_dir, years, words, num_words, min_count, sample):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=[i, queue, out_dir, in_dir, count_dir, words, num_words, min_count, sample]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Computes various frequency statistics.")
    parser.add_argument("out_dir")
    parser.add_argument("in_dir")
    parser.add_argument("count_dir")
    parser.add_argument("word_file")
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("--num-words", type=int, default=None)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000)
    parser.add_argument("--year-inc", type=int, help="end year (inclusive)", default=1)
    parser.add_argument("--min-count", type=int, default=100)
    parser.add_argument("--sample", type=float, default=1e-5)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    words = ioutils.load_year_words(args.word_file, years)
    ioutils.mkdir(args.out_dir)
    run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/", args.count_dir + "/", years, words, args.num_words, args.min_count, args.sample)       
Example #41
                        default=None)
    parser.add_argument("--start-year",
                        type=int,
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="start year (inclusive)",
                        default=END_YEAR)
    parser.add_argument("--num-words",
                        type=int,
                        help="size of vocabulary",
                        default=20000)
    parser.add_argument("--smooth",
                        type=int,
                        help="smoothing factor",
                        default=SMOOTH)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    if args.smooth == 0:
        smooth = 0
    else:
        smooth = 10.0**(-1 * float(args.smooth))
    index = ioutils.load_pickle(args.in_dir + "/index.pkl")
    year_index_infos = ioutils.load_year_index_infos_common(
        index, years, args.word_file, num_words=args.num_words)
    out_dir = args.out_dir + "/lsmooth" + str(args.smooth)
    ioutils.mkdir(out_dir)
    run_parallel(args.num_procs, out_dir + "/", args.in_dir + "/", smooth,
                 year_index_infos, args.conf_dir)
Example #42
    if args.word_file != None:
        if args.index_dir == None:
            print >> sys.stderr, "Must specify index dir with word file!"
            sys.exit()
        word_pickle = ioutils.load_pickle(args.word_file)
        if not args.start_year in word_pickle:
            word_lists = {}
            for year in years:
                word_lists[year] = word_pickle
        else:
            word_lists = word_pickle
        word_infos = {}
        for year, word_list in word_lists.iteritems():
            year_index = ioutils.load_pickle(args.index_dir + "/" + str(year) + "-index.pkl")
            if args.num_words != -1:
                word_list = word_list[: args.num_words]
            word_list, word_indices = get_word_indices(word_list, year_index)
            word_infos[year] = (word_list, word_indices)
        outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
        if args.num_words != -1:
            outpref += "-top" + str(args.num_words)
    else:
        word_info = None
        outpref = "/netstats/net"
    if args.thresh != None:
        outpref += "-" + str(args.thresh)
    ioutils.mkdir(args.dir + "/netstats")
    run_parallel(
        args.num_procs, args.dir + outpref, args.dir + "/netstats/", args.dir + "/", years, word_info, args.thresh
    )
Example #43
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=END_YEAR)
    parser.add_argument("--thresh",
                        type=float,
                        help="relevance threshold",
                        default=THRESH)
    args = parser.parse_args()
    years = range(args.start_year + 1, args.end_year + 1)
    word_list = ioutils.load_pickle(args.word_file)
    index = ioutils.load_pickle(args.index_file)
    if args.num_words != -1:
        word_list = word_list[:args.num_words]
    ioutils.mkdir(args.dir + "/volstats")
    word_list, word_indices = get_word_indices(word_list, index)
    outpref = "/volstats/" + args.word_file.split("/")[-1].split(
        ".")[0] + "-" + str(args.thresh)
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    displacement_base = matstore.retrieve_mat_as_binary_coo_thresh(
        args.dir + "/" + str(args.end_year) + ".bin",
        args.thresh,
        min_size=MIN_SIZE)
    displacement_base = displacement_base.tocsr()
    run_parallel(args.num_procs, args.dir + outpref,
                 args.dir + outpref + "-tmp", args.dir + "/", years, word_list,
                 word_indices, displacement_base, args.thresh)
Example #44
                new_row = get_index(merged_index, year_list, mat.row[i])
                new_col = get_index(merged_index, year_list, mat.col[i])
                counts[(new_row, new_col)] += mat.data[i]
            print "Done year ", decade + year
        export_mat_from_dict(counts, decade, out_dir)
        write_pickle(merged_index, out_dir + str(decade) + "-index.pkl")
        write_pickle(list(merged_index), out_dir + str(decade) + "-list.pkl")

def run_parallel(num_procs, out_dir, in_dir, decades):
    queue = Queue()
    for decade in decades:
        queue.put(decade)
    procs = [Process(target=worker, args=[i, queue, out_dir, in_dir]) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.")
    parser.add_argument("out_dir", help="path to network data (also where output goes)")
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)")
    parser.add_argument("--end-year", type=int, help="end year (inclusive)")
    args = parser.parse_args()
    decades = range(args.start_year, args.end_year + 1, 10)
    decades.reverse()
    mkdir(args.out_dir)
    run_parallel(args.num_procs, args.out_dir + "/",  args.in_dir + "/", decades)       
Example #45
                    '-c-init-file', out_dir + SAVE_FILE.format(year=years[i-1]) + "-c.bin",
                    '-threads', str(workers), 
                    '-train', in_dir + INPUT_FILE.format(year=year),
                    '-size', str(dim),
                    '-sample', '0',
                    '-negative', '5',
                    '-wvocab', in_dir + VOCAB_FILE.format(year=year),
                    '-cvocab', in_dir + VOCAB_FILE.format(year=year),
                    '-verbose', '2'])

if __name__ == "__main__":
    parser = ArgumentParser("Runs sequential Glove embeddings for years")
    parser.add_argument("in_dir", help="Directory with cooccurrence information and vocab.")
    parser.add_argument("out_dir")
    parser.add_argument("--dim", type=int, default=300)
    parser.add_argument("--workers", type=int, default=50)
    parser.add_argument("--start-year", type=int, default=1800)
    parser.add_argument("--end-year", type=int, default=2000)
    parser.add_argument("--year-inc", type=int, default=1)
    parser.add_argument("--sequential", action="store_true")
    args = parser.parse_args()
    if not args.sequential:
        out_dir = args.out_dir + "/noinit/"
    else:
        out_dir = args.out_dir
    out_dir = out_dir + "/" + str(args.dim) + "/"
    mkdir(out_dir)
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    train_years(years, args.in_dir + "/", out_dir, args.dim, args.workers, args.sequential)