def joint_probabilities(config=None,
                        outfile=None,
                        infile=None,
                        indictfile=None):
    t0 = time.time()
    # read configuration file
    conf = configparser.ConfigParser()
    with open(config) as config_file:
        conf.read_file(config_file)

    data_dir = conf.get('main', 'data_dir')
    log_file_name = '003_joint_probabilities.log'
    log_file_path = os.path.join(data_dir, log_file_name)

    def log(msg):
        s = json.dumps(msg)
        print(s)
        with open(log_file_path, "a") as f:
            f.write(s)
            f.write("\n")

    # ===============================================================
    # file names
    if infile and os.path.isfile(infile):
        file_path_input = infile
    else:
        file_path_input = f'{data_dir}/001_tokenizer_output.jsonl'
    log(('input', file_path_input))

    if outfile:
        file_path_output = outfile
    else:
        file_path_output = f'{data_dir}/003_joint_probabilities.npy'
    log(('output', file_path_output))

    if indictfile and os.path.isfile(indictfile):
        file_path_dict = indictfile
    else:
        file_path_dict = f'{data_dir}/001_tokenizer_dict.jsonl'
    log(('dictfile', file_path_dict))

    # /file names
    # ===============================================================
    topic_model = tm.Model(data_dir)

    with jsonlines.open(file_path_dict) as reader:
        topic_model.set_word_dictionary({row[0]: row[1] for row in reader})

    cooccurrence_probability = topic_model.coccurences(file_path_input,
                                                       lambda it: it['tokens'])
    numpy.save(file_path_output, cooccurrence_probability)
    t1 = time.time()
    log("finished")
    log((
        "time",
        t1 - t0,
    ))
    process = psutil.Process(os.getpid())
    log(('used RAM(bytes)=', process.memory_info().rss))  # in bytes
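
A minimal call sketch for the function above (the config path and explicit file arguments are hypothetical; when an argument is omitted, or an input file does not exist, the defaults under data_dir shown above are used):

joint_probabilities(config='pipeline.ini')
joint_probabilities(config='pipeline.ini',
                    infile='data/001_tokenizer_output.jsonl',
                    indictfile='data/001_tokenizer_dict.jsonl',
                    outfile='data/003_joint_probabilities.npy')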
Example 2

def stopwords(config=None,
              outfile=None,
              infile=None,
              dictfile=None,
              reduceddictfile=None):
    t0 = time.time()
    # read configuration file
    conf = configparser.ConfigParser()
    with open(config) as config_file:
        conf.read_file(config_file)

    data_dir = conf.get('main', 'data_dir')
    log_file_name = '004_stopwords.log'
    log_file_path = os.path.join(data_dir, log_file_name)

    def log(msg):
        s = json.dumps(msg)
        print(s)
        with open(log_file_path, "a") as f:
            f.write(s)
            f.write("\n")

    # ===============================================================
    # file names
    if infile and os.path.isfile(infile):
        file_path_input = infile
    else:
        file_path_input = f'{data_dir}/003_joint_probabilities.npy'
    log(('input', file_path_input))

    if outfile:
        file_path_output = outfile
    else:
        file_path_output = f'{data_dir}/004_stopwords_output.jsonl'
    log(('output', file_path_output))

    if dictfile and os.path.isfile(dictfile):
        file_path_dict = dictfile
    else:
        file_path_dict = f'{data_dir}/002_rarewords_reduceddict.jsonl'
    log(('dictfile', file_path_dict))

    if reduceddictfile:
        file_path_reduced_dict = reduceddictfile
    else:
        file_path_reduced_dict = f'{data_dir}/004_stopwords_reduceddict.jsonl'
    log(('reduceddictfile', file_path_reduced_dict))
    # /file names
    # ===============================================================
    topic_model = tm.Model(data_dir)

    with jsonlines.open(file_path_dict) as reader:
        topic_model.set_word_dictionary({row[0]: row[1] for row in reader})

    Hmax = conf.getfloat('main', 'Hmax')

    cooccurrence_probability = numpy.load(file_path_input)

    stopwords_dict = topic_model.stopwords(cooccurrence_probability, Hmax)
    log(("stopwords", stopwords_dict))
    with jsonlines.open(file_path_output, mode='w') as writer:
        for k in stopwords_dict:
            writer.write([k, stopwords_dict[k]])

    # exclude stopwords from word_dictionary
    reduced_word_dictionary = topic_model.reduced_dictionary(
        topic_model.word_dictionary, stopwords_dict, {})
    with jsonlines.open(file_path_reduced_dict, mode='w') as writer:
        for k in reduced_word_dictionary:
            writer.write([k, reduced_word_dictionary[k]])
    t1 = time.time()
    log("finished")
    log((
        "time",
        t1 - t0,
    ))
    process = psutil.Process(os.getpid())
    log(('used RAM(bytes)=', process.memory_info().rss))  # in bytes
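
The dictionary files passed between these steps are JSON Lines of two-element arrays, one [key, value] pair per line (that is how writer.write([k, v]) serializes them above). A minimal sketch of loading such a file back into a dict, mirroring the readers above (the path is hypothetical):

import jsonlines

with jsonlines.open('data/004_stopwords_reduceddict.jsonl') as reader:
    word_dict = {row[0]: row[1] for row in reader}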
Example 3

def rare_words(config=None,
               outfile=None,
               infile=None,
               dictfile=None,
               reduceddictfile=None):
    t0 = time.time()
    # read configuration file
    conf = configparser.ConfigParser()
    with open(config) as config_file:
        conf.read_file(config_file)

    data_dir = conf.get('main', 'data_dir')
    log_file_name = '002_rarewords.log'
    log_file_path = os.path.join(data_dir, log_file_name)

    def log(msg):
        s = json.dumps(msg)
        print(s)
        with open(log_file_path, "a") as f:
            f.write(s)
            f.write("\n")

    alpha = conf.getfloat('main', 'alpha')

    # ===============================================================
    # file names
    if infile and os.path.isfile(infile):
        file_path_input = infile
    else:
        file_path_input = f'{data_dir}/001_tokenizer_output.jsonl'
    log(('input', file_path_input))

    if outfile:
        file_path_output = outfile
    else:
        file_path_output = f'{data_dir}/002_rarewords_output.jsonl'
    log(('output', file_path_output))

    if dictfile and os.path.isfile(dictfile):
        file_path_dict = dictfile
    else:
        file_path_dict = f'{data_dir}/001_tokenizer_dict.jsonl'
    log(('dictfile', file_path_dict))

    if reduceddictfile:
        file_path_reduced_dict = reduceddictfile
    else:
        file_path_reduced_dict = f'{data_dir}/002_rarewords_reduceddict.jsonl'
    log(('reduceddictfile', file_path_reduced_dict))
    # /file names
    # ===============================================================

    topic_model = tm.Model(data_dir)

    word_dictionary = dict()
    with jsonlines.open(file_path_dict) as reader:
        for row in reader:
            word_dictionary[row[0]] = row[1]

    topic_model.set_word_dictionary(word_dictionary)

    rare_words_dict = topic_model.rare_words_memory_optimal(
        file_path_input, alpha, lambda it: it['tokens'])

    with jsonlines.open(file_path_output, mode='w') as writer:
        for k in rare_words_dict:
            writer.write([k, rare_words_dict[k]])

    # exclude rare words from word_dictionary
    reduced_word_dictionary = topic_model.reduced_dictionary(
        word_dictionary, {}, rare_words_dict)
    with jsonlines.open(file_path_reduced_dict, mode='w') as writer:
        for k in reduced_word_dictionary:
            writer.write([k, reduced_word_dictionary[k]])

    log(("Dictionary size", len(word_dictionary), " => ",
         len(reduced_word_dictionary)))
    t1 = time.time()
    log("finished")
    log((
        "time",
        t1 - t0,
    ))
    process = psutil.Process(os.getpid())
    log(('used RAM(bytes)=', process.memory_info().rss))  # in bytes
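
Judging by the numbered default file names, these steps run in sequence on top of the 001_tokenizer outputs: rare_words (002) prunes rare terms from the dictionary, joint_probabilities (003) builds the co-occurrence matrix, and stopwords (004) then removes high-entropy terms. A hedged sketch of chaining them through their defaults (the config path is hypothetical):

rare_words(config='pipeline.ini')
joint_probabilities(config='pipeline.ini')
stopwords(config='pipeline.ini')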
Example 4

def snowball(config=None,
             outfile=None,
             infile=None,
             inptmfile=None,
             incooccurrencefile=None,
             indictfile=None):
    t0 = time.time()

    # read configuration file
    conf = configparser.ConfigParser()
    with open(config) as config_file:
        conf.read_file(config_file)

    data_dir = conf.get('main', 'data_dir')
    log_file_name = '007_restricted_snowball.log'
    log_file_path = os.path.join(data_dir, log_file_name)

    def log(msg):
        s = json.dumps(msg)
        print(s)
        with open(log_file_path, "a") as f:
            f.write(s)
            f.write("\n")

    # =========

    rest_endpoint = json.loads(conf.get('msacademic', 'restEndpoint'))
    subscription_key = conf.get('msacademic', 'subscriptionKey')
    include_topics = json.loads(
        conf.get('msacademic', 'msAcademicIncludeTopicsIds'))
    exclude_topics = json.loads(
        conf.get('msacademic', 'msAcademicExcludeTopicsIds'))
    max_distance = conf.getfloat('main', 'maxDistance')

    measure = conf.get('main', 'measure')
    measure_types = {
        'kl': measures.kl_divergence,
        'skl': measures.skl_divergence,
        'js': measures.js_divergence,
        'hell': measures.hellinger_distance
    }
    if measure in measure_types:
        difference = measure_types[measure]
    else:
        log(('undefined measure', measure, 'available types are',
             list(measure_types)))
        exit()

    api = Api(subscription_key, rest_endpoint, include_topics)

    # =====================================================
    # load initial ids to queue
    file_path_queued_ids = f'{data_dir}/007_restricted_snowball_queued_ids.csv'  # queued items
    file_path_seed_ids = f'{data_dir}/in-seed.csv'  # seed item ids
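    # Note (inferred from the readers below): both files are read with
    # csv.reader(delimiter="\t") and only row[0] is used, so one item id per
    # line is sufficient.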
    log(('infile', infile))
    if infile and infile == 'resume' and os.path.isfile(file_path_queued_ids):
        file_path_initial_queued_ids = file_path_queued_ids
    elif infile and os.path.isfile(infile):
        if os.path.isfile(file_path_queued_ids):
            os.remove(file_path_queued_ids)
        file_path_initial_queued_ids = infile
    else:
        if os.path.isfile(file_path_queued_ids):
            os.remove(file_path_queued_ids)
        file_path_initial_queued_ids = file_path_seed_ids
    log(('file_path_initial_queued_ids', file_path_initial_queued_ids))

    queued_ids_set = set()
    queued_ids = queue.Queue()
    with open(file_path_initial_queued_ids, newline='') as csvfile:
        queue_reader = csv.reader(csvfile, delimiter="\t", quotechar='"')
        for row in queue_reader:
            item_id = str(row[0])
            if item_id not in queued_ids_set:
                queued_ids_set.add(item_id)
                queued_ids.put(item_id)
    # =====================================================

    # =====================================================
    # load known ids
    file_path_known_ids = f'{data_dir}/007_restricted_snowball_known_ids.csv'  # items that were downloaded
    known_ids = set()
    if infile and infile == 'resume' and os.path.isfile(file_path_known_ids):
        with open(file_path_known_ids, newline='') as csvfile:
            queue_reader = csv.reader(csvfile, delimiter="\t", quotechar='"')
            for row in queue_reader:
                known_ids.add(str(row[0]))
    elif os.path.isfile(file_path_known_ids):
        os.remove(file_path_known_ids)
    # /load known ids
    # =====================================================

    # =====================================================
    # load done ids
    file_path_done_ids = f'{data_dir}/007_restricted_snowball_done_ids.csv'  # items that were in the queue
    done_ids = set()
    if infile and infile == 'resume' and os.path.isfile(file_path_done_ids):
        with open(file_path_done_ids, newline='') as csvfile:
            queue_reader = csv.reader(csvfile, delimiter="\t", quotechar='"')
            for row in queue_reader:
                item_id = str(row[0])
                # queued_ids_set.add(item_id) ????
                done_ids.add(item_id)
    elif os.path.isfile(file_path_done_ids):
        os.remove(file_path_done_ids)
    # /load done ids
    # =====================================================

    # =====================================================
    # dictionary
    if indictfile and os.path.isfile(indictfile):
        file_path_dict = indictfile
    else:
        file_path_dict = f'{data_dir}/001_tokenizer_dict.jsonl'
    log(('indictfile', file_path_dict))
    with jsonlines.open(file_path_dict) as reader:
        word_dictionary = {row[0]: row[1] for row in reader}
    # /dictionary
    # =====================================================

    # =====================================================
    # cooccurrence
    if incooccurrencefile and os.path.isfile(incooccurrencefile):
        file_path_cooccurrence = incooccurrencefile
    else:
        file_path_cooccurrence = f'{data_dir}/005_reduced_joint_probabilities.npy'
    log(('incooccurrencefile', file_path_cooccurrence))
    j_prob_reduced = numpy.load(file_path_cooccurrence)
    # /cooccurrence
    # =====================================================

    # =====================================================
    # PTM
    if inptmfile and os.path.isfile(inptmfile):
        file_path_ptm = inptmfile
    else:
        file_path_ptm = f'{data_dir}/006_ptm_output.npy'
    log(('inptmfile', file_path_ptm))
    ptm_data = numpy.load(file_path_ptm, allow_pickle=True)
    ptm_data = ptm_data.item()
    ptm = tm.Model(data_dir)
    ptm.set_word_dictionary(word_dictionary)
    ptm.load_topic_model(j_prob_reduced, ptm_data)
    # /PTM
    # =====================================================

    # =====================================================
    # place to store downloaded and selected item
    if outfile:
        file_path_output = outfile
    else:
        file_path_output = f'{data_dir}/007_restricted_snowball_output.jsonl'

    if not (infile
            and infile == 'resume') and os.path.isfile(file_path_output):
        os.remove(file_path_output)

    log(('output', file_path_output))
    # =====================================================

    # =====================================================
    # load seeds
    log(('seed_ids_file', file_path_seed_ids))
    seed_ids = set()
    with open(file_path_seed_ids, newline='') as csvfile:
        queue_reader = csv.reader(csvfile, delimiter="\t", quotechar='"')
        for row in queue_reader:
            seed_ids.add(str(row[0]))

    n_accepted_ids = 0
    seed_items = dict()
    if os.path.isfile(file_path_output):
        with jsonlines.open(file_path_output) as reader:
            for item in reader:
                n_accepted_ids += 1
                item_id = str(item['id'])
                if item_id in seed_ids:
                    seed_items[item_id] = item
    log(('seed_ids', [x for x in seed_items]))
    # =====================================================

    # =====================================================
    # init NLP tools
    ct = nlp.CustomTokenizer()
    ct.stemmer = PorterStemmer()
    ct.valid_pos_tags = {
        'NNP': 1,
        'JJ': 1,
        'NN': 1,
        'NNS': 1,
        'JJS': 1,
        'JJR': 1,
        'NNPS': 1
    }
    ct.tester = re.compile('^[a-zA-Z]+$')
    ct.stop = set(stopwords.words('english'))
    ct.word_dictionary = word_dictionary
    # /init NLP tools
    # =====================================================

    # =====================================================
    # snowball loop
    batch_size = conf.getint('main', 'batch_size')
    save_period = conf.getint('main', 'save_period')
    cnt = 0
    api_call_counter = 0
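    # cnt counts batches since the last checkpoint (it is reset when the state
    # is saved below); api_call_counter is tracked only for the progress log.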
    while True:
        json_batch = []
        next_batch_ids = []
        if len(seed_items) == 0:
            next_batch_ids.extend([x for x in seed_ids if x not in seed_items])
            log(('seed_ids=>next_batch_ids', next_batch_ids))
        else:
            try:
                while len(next_batch_ids) < batch_size:
                    next_id = queued_ids.get_nowait()
                    if next_id not in done_ids:
                        next_batch_ids.append(next_id)
            except queue.Empty:
                pass
            log(('next_batch_ids', next_batch_ids))

        if len(next_batch_ids) == 0:
            break

        done_ids.update(next_batch_ids)

        items = api.load_by_ids(next_batch_ids)
        items.extend(api.load_by_rids(next_batch_ids))
        api_call_counter += 2
        log(('api_call_counter', api_call_counter, 'queue_size',
             queued_ids.qsize(), 'items', len(items)))
        n_known_items = 0
        for item in items:
            entry_id = str(item['id'])
            if entry_id in known_ids:
                n_known_items += 1
                continue
            """
            the item was not seen before
            """
            known_ids.add(entry_id)

            # -----------------------------------------------------------------
            # get tokens
            item['tokens'] = []
            if "topics" in item and item["topics"]:
                item['tokens'].extend([it['name'] for it in item["topics"]])
            item['tokens'].extend(
                ct.exclude_unknown_tokens(
                    ct.get_tokens(
                        str(item['title']) + ". " + str(item['abstract']))))
            # /get tokens
            # -----------------------------------------------------------------

            # -----------------------------------------------------------------
            # apply PTM
            item['ptm'] = list(ptm.topics_from_doc(item['tokens']))
            # -----------------------------------------------------------------

            # -----------------------------------------------------------------
            # distance_to_seed
            if entry_id in seed_ids:
                item['distance_to_seed'] = 0
            else:
                item['distance_to_seed'] = min([
                    difference(item['ptm'], seed_items[seed_item_id]['ptm'])
                    for seed_item_id in seed_items
                ])
            # /distance_to_seed
            # -----------------------------------------------------------------

            # -----------------------------------------------------------------
            # save seed to separate dictionary
            if entry_id in seed_ids:
                seed_items[entry_id] = item
            # -----------------------------------------------------------------

            # reject items tagged with an excluded topic or too far from the
            # seed set
            item_is_valid = True
            for t in item.get('topics') or []:
                if t['id'] in exclude_topics:
                    item_is_valid = False
                    break

            if item_is_valid and item['distance_to_seed'] > max_distance:
                item_is_valid = False

            if item_is_valid:

                n_accepted_ids += 1
                json_batch.append(item)

                # -------------------------------------------
                # extend queue
                if entry_id not in done_ids and entry_id not in queued_ids_set:
                    queued_ids.put(entry_id)

                for related_entry_id in item['references_to']:
                    if related_entry_id not in done_ids and related_entry_id not in queued_ids_set:
                        queued_ids_set.add(related_entry_id)
                        queued_ids.put(related_entry_id)

                for related_entry_id in item['referenced_by']:
                    if related_entry_id not in done_ids and related_entry_id not in queued_ids_set:
                        queued_ids_set.add(related_entry_id)
                        queued_ids.put(related_entry_id)
                # /extend queue
                # -------------------------------------------
                msg = ("+++++accepted", n_accepted_ids, 'of', len(known_ids),
                       "id", item['id'], "dist", item['distance_to_seed'],
                       "ECC=", item['ecc'], "year", item['year'], "title",
                       item['title'])
            else:
                msg = ("-----rejected", n_accepted_ids, 'of', len(known_ids),
                       "id", item['id'], "dist", item['distance_to_seed'],
                       "ECC=", item['ecc'], "year", item['year'], "title",
                       item['title'])
            log(msg)

        log(('n_known_items', n_known_items))
        with jsonlines.open(file_path_output, mode='a') as writer:
            for item in json_batch:
                log(('id', item['id'], 'year', item['year'], 'title',
                     item['title']))
                writer.write(item)

        if cnt >= save_period or n_known_items == len(items):
            """
                save current state
            """
            cnt = 0
            with open(file_path_queued_ids, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter="\t",
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

                queued_ids_old = queued_ids
                queued_ids = queue.Queue()
                try:
                    while True:
                        entry_id = queued_ids_old.get_nowait()
                        writer.writerow([entry_id])
                        queued_ids.put(entry_id)
                except queue.Empty:
                    pass

            with open(file_path_done_ids, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter="\t",
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                for entry_id in done_ids:
                    writer.writerow([entry_id])

            with open(file_path_known_ids, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter="\t",
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                for entry_id in known_ids:
                    writer.writerow([entry_id])

            if len(done_ids) >= 20000 or n_known_items == len(items):
                break

        cnt += 1
    # /snowball loop
    # =====================================================

    t1 = time.time()
    log("finished")
    log((
        "time",
        t1 - t0,
    ))
    process = psutil.Process(os.getpid())
    log(('used RAM(bytes)=', process.memory_info().rss))  # in bytes
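
A minimal call sketch for the snowball step above (the config path is hypothetical). With no infile the queue is seeded from in-seed.csv and any leftover checkpoint files are removed first; passing infile='resume' instead reloads the saved *_queued_ids, *_known_ids and *_done_ids checkpoints and appends to the existing output:

snowball(config='pipeline.ini')                    # fresh crawl from in-seed.csv
snowball(config='pipeline.ini', infile='resume')   # continue from the last checkpoint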
Example 5

def do_ssnmf(config=None, outfile=None, infile=None, outptmfile=None):
    t0 = time.time()
    # read configuration file
    conf = configparser.ConfigParser()
    with open(config) as config_file:
        conf.read_file(config_file)

    data_dir = conf.get('main', 'data_dir')
    log_file_name = '006_SSNMF.log'
    log_file_path = os.path.join(data_dir, log_file_name)

    def log(msg):
        s = json.dumps(msg)
        print(s)
        with open(log_file_path, "a") as f:
            f.write(s)
            f.write("\n")

    # ===============================================================
    # file names
    if infile and os.path.isfile(infile):
        file_path_input = infile
    else:
        file_path_input = f'{data_dir}/005_reduced_joint_probabilities.npy'
    log(('input', file_path_input))

    if outfile:
        file_path_output = outfile
    else:
        file_path_output = f'{data_dir}/006_ssnmf_output.npy'
    log(('output', file_path_output))

    if outptmfile:
        file_path_ptm_output = outptmfile
    else:
        file_path_ptm_output = f'{data_dir}/006_ptm_output.npy'
    log(('outptmfile', file_path_ptm_output))

    # /file names
    # ===============================================================

    # read configuration file
    p_max = conf.getint('main', 'Pmax')
    lam = conf.getfloat('main', 'lambda')

    j_prob_reduced = numpy.load(file_path_input)
    log(("len(j_prob_reduced)=", len(j_prob_reduced)))

    topic_model = tm.Model(data_dir)

    params = {
        'maxIterations': 50,
        'lambda': lam,
        'eta': 0.1,
        'beta': 0.99,
        'beta2': 1.000,
        'maxError': 1e-7,
    }

    # run the factorization repeatedly, feeding the previous factor back in
    # via params['H'] (presumably as a warm start for the next pass)
    for _ in range(20):

        h = snmf.gradient_descent(j_prob_reduced, p_max, params)
        params['H'] = h

        # h = snmf.sparse_gradient_descent(j_prob_reduced, p_max, params)
        # params['H'] = h

        # h = snmf.sparse_multiplicative(j_prob_reduced, p_max, params)
        # params['H'] = h

        # h = snmf.gradient_descent(j_prob_reduced, p_max, params)
        # params['H'] = h

        numpy.save(file_path_output, h)

        TM = topic_model.model_from_factor(h)
        numpy.save(file_path_ptm_output, TM)
    t1 = time.time()
    log("finished")
    log(("time", t1 - t0,))
    process = psutil.Process(os.getpid())
    log(('used RAM(bytes)=', process.memory_info().rss))  # in bytes
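
Taken together, the examples above read the following configuration keys. A hedged sketch of a matching INI file (all values are hypothetical; restEndpoint, msAcademicIncludeTopicsIds and msAcademicExcludeTopicsIds are passed through json.loads, so they must hold valid JSON):

[main]
data_dir = /path/to/data
alpha = 0.001
Hmax = 0.9
Pmax = 50
lambda = 0.1
measure = hell
maxDistance = 0.5
batch_size = 50
save_period = 10

[msacademic]
restEndpoint = "https://example.org/academic/v1/evaluate"
subscriptionKey = YOUR_SUBSCRIPTION_KEY
msAcademicIncludeTopicsIds = [123456, 234567]
msAcademicExcludeTopicsIds = [345678]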