def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    # Load the precomputed IDF values (word -> inverse document frequency).
    with open(inputs.get('idf_file')) as idf_file:
        idf = json.loads(idf_file.read())
    text = (" ".join([inputs.get(field) for field in inputs.get('fields')])).lower()

    # Count how often each non-stopword appears in this document.
    words_in_document = {}
    total_words = 0
    for word in tokenize(to_unicode(text)):
        if word in STOPWORDS:
            continue
        if not words_in_document.get(word):
            words_in_document[word] = 0
        words_in_document[word] += 1
        total_words += 1

    # TF-IDF = idf(word) * term frequency; words missing from the IDF file score 0.
    tfidf = []
    for word, word_count in words_in_document.iteritems():
        tfidf.append((word, idf.get(word, 0) * (1.0 * word_count / total_words)))
    tfidf = sorted(tfidf, key=lambda tfidf_tuple: -tfidf_tuple[1])
    return [{'tfidf': tfidf}]
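# Illustrative sketch only (not one of the workers): the per-document weight
# computed above for a word that occurs 3 times in a 60-word document and has
# an IDF of 4.61 in the idf_file. All numbers are made up.
print(4.61 * (1.0 * 3 / 60))  # idf * term frequency = 0.2305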
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    input_file = inputs.get('input_file')
    target_file = inputs.get('target_file')
    fields = inputs.get('fields')

    # Count, for every word, the number of documents (lines) that contain it.
    first_line = True
    headers = []
    words_in_documents = {}
    total_documents = 0
    for line in open(input_file):
        line_data = line.rstrip("\n").split("\t")
        if first_line:
            headers = line_data
            first_line = False
            continue
        text = (" ".join([line_data[headers.index(field)] for field in fields])).lower()
        # A set so each word counts at most once per document.
        for word in set(tokenize(text.decode('utf-8'))):
            if not words_in_documents.get(word):
                words_in_documents[word] = 0
            words_in_documents[word] += 1
        total_documents += 1

    # Turn document counts into IDF values and force stopwords to 0.
    for key in words_in_documents.iterkeys():
        words_in_documents[key] = math.log(total_documents / (1.0 * words_in_documents[key]))
    for stopword in STOPWORDS:
        words_in_documents[stopword] = 0

    with open(target_file, 'w') as _file:
        _file.write(to_json(words_in_documents).encode('utf-8'))
    return [{'idf_file': target_file}]
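# Illustrative sketch only: the IDF value written to target_file above for a
# word that occurs in 10 out of 1000 documents; stopwords are forced to 0
# afterwards so they never contribute to a document's TF-IDF.
import math
print(math.log(1000 / (1.0 * 10)))  # log(100) ~= 4.61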
def process(specific_info, data, *args):
    inputs = poliglo.get_inputs(data, specific_info)
    fields = inputs.get('fields', [])
    text = (" ".join([inputs.get(field) or "" for field in fields])).lower()
    annotated_data = get_annotations_for_text(text)
    return [{'bioportal_annotated': annotated_data}]
def process(specific_info, data, *args):
    inputs = poliglo.get_inputs(data, specific_info)
    connection = args[0].get('connection')
    waiting_queue_name = get_waiting_queue_name(
        data['process']['id'], data['process']['worker_id'], inputs['wait_jobs_from']
    )
    # Park the job in a Redis sorted set, scored by arrival time, until the
    # jobs it is waiting for have finished.
    connection.zadd(waiting_queue_name, time(), to_json(data))
    return []
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    tfidf_worker_id = inputs.get('tfidf_worker_id')
    queue = inputs.get('__read_from_queue')
    connection = args[0].get('connection')

    # Collect every job waiting in the group queue and gather, per word, the
    # TF-IDF value it got in each document.
    all_words = {}
    all_data = []  # kept around for the bioportal merge below
    if queue:
        queue_values = connection.zrange(queue, 0, -1)
        for queue_raw_data in queue_values:
            queue_data = json_loads(queue_raw_data)
            all_data.append(queue_data)
            tfidf = select_dict_el(queue_data, 'workers_output.%s.tfidf' % tfidf_worker_id)
            for word, value in tfidf:
                if not all_words.get(word):
                    all_words[word] = []
                all_words[word].append(value)

    # Group score: 65% mean TF-IDF, 35% how widely the word is shared across
    # documents (capped at 1, relative to a fifth of the top document count).
    max_appearance = max([len(values) for (word, values) in all_words.iteritems()]) / 5.0
    tfidf_results = [
        (word, 1.0 * sum(values) / len(values) * 0.65 + 0.35 * min(len(values) / max_appearance, 1))
        for (word, values) in all_words.iteritems()
    ]
    tfidf_results.sort(key=lambda tup: -tup[1])

    # Bioportal: merge the per-document annotations into one entry per ontology term.
    bioportal_worker_id = inputs.get('bioportal_worker_id')
    bioportal_mesh_names_url = inputs.get('bioporta_mesh_names_url')
    mesh_names = json_loads(requests.get(bioportal_mesh_names_url).content)
    bioportal_merged = {}
    for queue_data in all_data:
        bioportal_annotated = select_dict_el(
            queue_data, 'workers_output.%s.bioportal_annotated' % bioportal_worker_id
        )
        for mesh_data in bioportal_annotated.get('data'):
            ontology_id = mesh_data.get('ontology_quote_id')
            if not bioportal_merged.get(ontology_id):
                if not mesh_names.get(ontology_id):
                    continue
                bioportal_merged[ontology_id] = {
                    'ontology_quote_id': ontology_id,
                    'matched_terms': [],
                    'total_frequency': 0,
                    'included_in_documents': 0,
                    'name': mesh_names.get(ontology_id)
                }
            bioportal_merged[ontology_id]['total_frequency'] += mesh_data.get('frequency')
            bioportal_merged[ontology_id]['included_in_documents'] += 1
            bioportal_merged[ontology_id]['matched_terms'] = list(
                set(mesh_data.get('matched_terms') + bioportal_merged[ontology_id]['matched_terms'])
            )
    to_return_bioportal = sorted(
        bioportal_merged.values(),
        key=lambda k: k['included_in_documents'],
        reverse=True
    )
    return [{'group_tfidf': tfidf_results, 'bioportal_merged': to_return_bioportal}]
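# Illustrative sketch only: how the blended group score above behaves for a
# word whose per-document TF-IDF values are 0.10, 0.20 and 0.30 when the most
# widely shared word appears in 20 of the queued documents. Numbers are made up.
tfidf_values = [0.10, 0.20, 0.30]
max_appearance = 20 / 5.0  # a fifth of the highest document count
score = (1.0 * sum(tfidf_values) / len(tfidf_values) * 0.65
         + 0.35 * min(len(tfidf_values) / max_appearance, 1))
print(score)  # 0.65 * 0.2 + 0.35 * 0.75 = 0.3925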
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    mongo_connection = args[0].get('mongo_connection')
    data_filter = inputs.get('data_filter', {})
    fields = inputs.get('fields', [])
    names = inputs.get('names', [])
    collection = inputs.get('collection', [])
    target_file = inputs.get('target_file')
    data_selector = dict([(field, 1) for field in fields])

    # If a matrix is given, restrict the query to the documents it references.
    matrix_id = inputs.get('matrix_id')
    if matrix_id:
        matrix_documents = set()
        matrix = mongo_connection.matrix.find_one({'_id': ObjectId(inputs.get('matrix_id'))})
        matrix_documents |= set([
            matrix_el.get('id')
            for matrix_el in select_dict_el(matrix, 'matrix_dict.matrix', [])
        ])
        matrix_documents |= set([
            matrix_el.get('id')
            for matrix_el in select_dict_el(matrix, 'matrix_dict.studies_order', [])
        ])
        data_filter.update({"id": {"$in": list(matrix_documents)}})

    if target_file:
        target_file = open(target_file, 'w')
        target_file.write("\t".join(names) + "\n")

    # Either dump the selected fields to a TSV file or yield them row by row.
    for episte_data in mongo_connection[collection].find(data_filter, data_selector):
        if target_file:
            text = u"\t".join([
                to_unicode(
                    select_dict_el(episte_data, field) or ''
                ).replace('\r\n', ' ').replace('\n', '').replace('\t', ' ')
                for field in fields
            ]).encode('utf-8')
            target_file.write(text + "\n")
        else:
            yield dict([
                (names[i], (select_dict_el(episte_data, field) or ''))
                for i, field in enumerate(fields)
            ])
    if target_file:
        yield {'episte_data_target_file': inputs.get('target_file')}
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    document_id = inputs.get('document_id')
    to_find_phrases = inputs.get('to_find_phrases', {})
    where_to_find = inputs.get('where_to_find', [])
    document_text = (" ".join([(inputs.get(where) or '') for where in where_to_find])).lower()

    # A phrase group matches if any of its phrases occurs in the document text.
    to_return = {'doc_id': document_id, 'phrases': {}}
    found_all = True
    for phrase_group, phrases in to_find_phrases.iteritems():
        to_return['phrases'][phrase_group] = False
        for phrase in phrases:
            if document_text.find(phrase.lower()) >= 0:
                to_return['phrases'][phrase_group] = True
        found_all = found_all and to_return['phrases'][phrase_group]
    to_return['all_phrases'] = found_all
    return [to_return]
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    queue = inputs.get('__read_from_queue')
    connection = args[0].get('connection')

    to_return_data = {
        'phrases': {'matching': {}, 'not_matching': {}},
        'all_phrases': {'matching': [], 'not_matching': []}
    }

    # Bucket every waiting document id by whether each phrase group matched,
    # and track which documents matched every group.
    queue_values = connection.zrange(queue, 0, -1)
    for queue_raw_data in queue_values:
        queue_data = json_loads(queue_raw_data).get('inputs')
        match_all = True
        for phrase_group, phrase_value in queue_data.get('phrases', {}).iteritems():
            target = 'matching' if phrase_value else 'not_matching'
            if not to_return_data['phrases'][target].get(phrase_group):
                to_return_data['phrases'][target][phrase_group] = []
            if queue_data['doc_id'] not in to_return_data['phrases'][target][phrase_group]:
                to_return_data['phrases'][target][phrase_group].append(queue_data['doc_id'])
            match_all = match_all and phrase_value
        target = 'matching' if match_all else 'not_matching'
        if queue_data['doc_id'] not in to_return_data['all_phrases'][target]:
            to_return_data['all_phrases'][target].append(queue_data['doc_id'])
    return [to_return_data]
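# Illustrative sketch only: the shape of the structure returned above, assuming
# two hypothetical phrase groups ('rct', 'placebo') and two queued documents.
# All ids and group names are made up.
example_output = {
    'phrases': {
        'matching': {'rct': ['doc-1']},
        'not_matching': {'rct': ['doc-2'], 'placebo': ['doc-1', 'doc-2']}
    },
    'all_phrases': {'matching': [], 'not_matching': ['doc-1', 'doc-2']}
}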