def __init__(self, dataset, mode='by_sent'):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.add_features = False
        self.dataset = dataset
        self.feature_templates = {}
        self.reading_mode = mode

        #Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.wordcluster_source = os.path.join(path_wordcluster, 'output_liang_' + mode + '_5000')
        self.word_clusters = {}

        self.word_reference = uploadObject(os.path.join(path_utils, 'word_dict_filtered'))  # word dict from training data
        self.word_reference[BR] = 1

        self.inner_trigger_words = {'I': {}}
        self.outer_trigger_words = {}

        self.outer_trigger_pos = {}
        self.inner_trigger_pos = {}

        if mode=='by_sent':
            self.FILTER_WORDS = 0.001
            #self.FILTER_WORDS = 0.005
            self.FILTER_POS = 0.001
        else:
            self.FILTER_WORDS = 0.01
            self.FILTER_POS = 0.001
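
# A minimal sketch of how the mode-dependent thresholds above might be applied;
# it assumes FILTER_WORDS is a relative-frequency cutoff over the counts stored
# in word_reference. The helper below and its name are illustrative only, not
# part of the original class.
def filter_rare_words(word_counts, threshold):
    # keep only words whose relative frequency reaches the cutoff
    total = float(sum(word_counts.values())) or 1.0
    return {w: c for w, c in word_counts.items() if c / total >= threshold}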
Example #2
    def __init__(self, dataset, mode='by_sent'):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.add_features = False
        self.dataset = dataset
        self.feature_templates = {}

        #Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.wordcluster_source = os.path.join(
            path_wordcluster, 'output_liang_' + mode + '_5000')
        self.word_clusters = {}

        self.word_reference = uploadObject(
            os.path.join(path_utils,
                         'word_dict_filtered'))  # word dict from training data
        self.word_reference[BR] = 1

        self.inner_trigger_words = {'I': {}}
        self.outer_trigger_words = {}

        self.outer_trigger_pos = {}
        self.inner_trigger_pos = {}

        if mode == 'by_sent':
            self.FILTER_WORDS = 0.001
            #self.FILTER_WORDS = 0.005
            self.FILTER_POS = 0.001
        else:
            self.FILTER_WORDS = 0.01
            self.FILTER_POS = 0.001
    def __init__(self, dataset, mode='by_sent'):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.add_features = False
        self.dataset = dataset
        self.feature_templates = {}

        #Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.wordcluster_source = os.path.join(path_wordcluster, 'output_liang_' + mode + '_5000')
        self.word_clusters = {}

        self.word_reference = uploadObject(os.path.join(path_utils, 'word_dict_filtered'))  # word dict from training data
        self.word_reference[BR] = 1

        self.stopwords = getStopwords(stem=True)

        self.inner_trigger_words = set()
def readDoc(doc_file):
	'''Read a tokenized document: one sentence per line, tokens separated by spaces.
	Returns an empty list if the file cannot be read.'''
	res = []
	try:
		with open(doc_file) as f:
			for line in f:
				line = line.strip('\n').replace('<br>', '')
				if line != '':
					sent = line.split(' ')
					res.append(sent)
	except IOError:
		pass
	return res
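
# Usage sketch (the file name is illustrative only): each element of the list
# returned by readDoc is one tokenized sentence, e.g.
#     doc = readDoc('some_job_posting.txt')
#     for sent in doc:
#         print(len(sent), sent)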


if __name__ == '__main__':
	model = uploadObject('sp_5_by_sent')

	ini = date(2014, 6, 1)

	data = core.find({'date': {'$gte': datetime.combine(ini, datetime.min.time())}}, timeout=False).batch_size(1000)

	count = 0
	for dat in data:
		id = dat['_id']
		job = tokenized.find_one({'$and': [{'_id.description': id['description']},
			{'_id.details': id['details']}]})
		if job is None:
			continue
		temp = job.get("carreras", [])
		if temp:
			print("YALA")  # already tagged with careers, skip
			continue
Example #5
data_dir = os.path.join(clustering_project_dir,'enero_ing_stem')

docs_dir = os.path.join(clustering_dir,'jobs_enero/docs')
ing_dir = os.path.join(clustering_project_dir,'jobs_enero/complete_docs')

if not os.path.exists(data_dir):
	os.makedirs(data_dir)


#########################################################################
client = MongoClient()
db = client.JobDB
db_prueba = db.prueba

#########################################################################
word_dict_filtered = uploadObject(os.path.join(utils_path,'word_dict_filtered'))

# load the identifier -> name mapping and the career hierarchy
with open('hierarchy/ident_names.json', 'r') as f:
	temp = json.load(f)
with open('hierarchy/carreras.json', 'r') as f:
	hierarchy = json.load(f)

# invert the mapping so a file identifier can be looked up by career name
fileByName = {}
for k, v in temp.items():
	fileByName[v] = k
#########################################################################



if __name__ == '__main__':
	
	files_ingenieria = []
	for level1 in hierarchy["children"]:
#########################################################################
client = MongoClient()
db = client.JobDB
db_prueba = db.prueba

#########################################################################
# load the identifier -> name mapping and the career hierarchy
with open('hierarchy/ident_names.json', 'r') as f:
	temp = json.load(f)
with open('hierarchy/carreras.json', 'r') as f:
	hierarchy = json.load(f)

# invert the mapping so a file identifier can be looked up by career name
fileByName = {}
for k, v in temp.items():
	fileByName[v] = k
#########################################################################
#########################################################################
print("Loading models...")
fun_model = uploadObject(os.path.join(funmodel_path,'prod_model'))
req_model = uploadObject(os.path.join(reqmodel_path,'sp_5_by_sent'))
carr_model = uploadObject(os.path.join(carrmodel_path,'prod_model'))
jobarea_model = uploadObject(os.path.join(jobarea_path,'sp_5_by_doc'))

def parallel_funct(docname):
	print(docname)
	doc = readDoc(os.path.join(source_dir,docname))
	sequence = makeSequence_doc(doc)

	fun_pred,_ = fun_model.viterbi_decode_bigram(sequence)
	fun_pred.sequence_list.seq_list[0].y = fun_pred.y

	jobarea_pred,_ = jobarea_model.viterbi_decode_bigram(sequence)
	jobarea_pred.sequence_list.seq_list[0].y = jobarea_pred.y
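
# A plausible driver for parallel_funct, offered only as a sketch: the use of
# source_dir with os.listdir and multiprocessing.Pool below is an assumption,
# not code from the original.
if __name__ == '__main__':
    from multiprocessing import Pool
    docnames = sorted(os.listdir(source_dir))
    # run the per-document decoding pipeline across worker processes
    with Pool(processes=4) as pool:
        pool.map(parallel_funct, docnames)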
Example #7
    def __init__(self, data, batch_size=1000, max_vocab_size=None,
                 lower=False):

        self.batch_size = batch_size
        self.dataset = data
        self.lower = lower  # whether to lowercase everything

        self.max_vocab_size = max_vocab_size

        # mapping from cluster IDs to cluster IDs,
        # to keep track of the hierarchy
        self.cluster_parents = {}
        self.cluster_counter = 0

        # the list of words in the vocabulary and their counts
        self.counts = defaultdict(int)
        self.trans = defaultdict(make_int_defaultdict)
        self.num_tokens = 0

        # the graph weights (w) and the effects of merging nodes (L)
        # (see Liang's thesis)
        self.w = defaultdict(make_float_defaultdict)
        self.L = defaultdict(make_float_defaultdict)

        # the 0/1 bit to add when walking up the hierarchy
        # from a word to the top-level cluster
        self.cluster_bits = {}
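        # Worked example (an assumption about the intended encoding, following
        # Brown/Liang-style clustering): if word 5 is merged into cluster 100
        # with bit '0', and cluster 100 is later merged into cluster 101 with
        # bit '1', then walking cluster_parents upward from 5 and concatenating
        # the collected cluster_bits gives word 5 the path '10' (root to leaf).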

        # load filtered lexicon
        self.word_reference = uploadObject(os.path.join(path_utils,'word_dict_filtered')) # word dict from all data (200k)
        self.stem_reference = uploadObject(os.path.join(path_utils,'stem_dict'))

        # find the most frequent words
        self.vocab = {}
        self.reverse_vocab = []
        self.create_vocab()

        # create sets of documents that each word appears in
        self.create_index()

        # make a copy of the list of words, as a queue for making new clusters
        word_queue = list(range(len(self.vocab)))

        # score potential clusters, starting with the most frequent words.
        # also, remove the batch from the queue
        self.current_batch = word_queue[:(self.batch_size + 1)]
        word_queue = word_queue[(self.batch_size + 1):]
        self.initialize_tables()

        while len(self.current_batch) > 1:
            # find the best pair of words/clusters to merge
            c1, c2 = self.find_best()

            # merge the clusters in the index
            self.merge(c1, c2)

            if word_queue:
                new_word = word_queue.pop(0)
                self.add_to_batch(new_word)

            logging.info('{} AND {} WERE MERGED INTO {}. {} REMAIN.'
                         .format(self.reverse_vocab[c1] if c1 < len(self.reverse_vocab) else c1,
                                 self.reverse_vocab[c2] if c2 < len(self.reverse_vocab) else c2,
                                 self.cluster_counter,
                                 len(self.current_batch) + len(word_queue) - 1))

            self.cluster_counter += 1
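
    # A sketch of reading the hierarchy back out; this method is not in the
    # original source and assumes cluster_bits maps each node to the 0/1 bit
    # chosen when that node was merged into cluster_parents[node].
    def get_bit_string(self, word_id):
        bits = []
        node = word_id
        while node in self.cluster_parents:
            bits.append(str(self.cluster_bits[node]))
            node = self.cluster_parents[node]
        # bits were collected leaf-to-root; reverse to get the root-to-leaf path
        return ''.join(reversed(bits))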
def readDoc(doc_file):
    '''Read a tokenized document: one sentence per line, tokens separated by spaces.
    Returns an empty list if the file cannot be read.'''
    res = []
    try:
        with open(doc_file) as f:
            for line in f:
                line = line.strip('\n').replace('<br>', '')
                if line != '':
                    sent = line.split(' ')
                    res.append(sent)
    except IOError:
        pass
    return res


if __name__ == '__main__':
    model = uploadObject('sp_5_by_sent')

    ini = date(2014, 6, 1)

    data = core.find(
        {
            'date': {
                '$gte': datetime.combine(ini, datetime.min.time())
            }
        },
        timeout=False).batch_size(1000)

    count = 0
    for dat in data:
        id = dat['_id']
        job = tokenized.find_one({