Example #1
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)

    with open(inputs.get('idf_file')) as idf_file:
        idf = json.load(idf_file)

    text = (" ".join([inputs.get(field) or "" for field in inputs.get('fields', [])])).lower()

    # Term frequency: count every non-stopword token in this document.
    words_in_document = {}
    total_words = 0
    for word in tokenize(to_unicode(text)):
        if word in STOPWORDS:
            continue
        words_in_document[word] = words_in_document.get(word, 0) + 1
        total_words += 1

    # TF-IDF per word: the precomputed IDF weight times the word's relative
    # frequency within this document, sorted by descending score.
    tfidf = []
    for word, word_count in words_in_document.iteritems():
        tfidf.append(
            (word, idf.get(word, 0) * (1.0 * word_count / total_words))
        )
    tfidf.sort(key=lambda tfidf_tuple: -tfidf_tuple[1])

    return [{'tfidf': tfidf},]
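To make the arithmetic above concrete: the score for each word is just the precomputed IDF weight multiplied by the word's relative frequency in this one document. A standalone sketch with invented words and IDF values (stopword filtering is omitted; words missing from the idf dict simply score 0):

# --- Standalone sketch, not part of the worker: per-document TF-IDF ---
# The idf weights and the document text below are invented example values.
idf = {'asthma': 2.3, 'children': 1.1, 'study': 0.2}

words = 'asthma study in children with severe asthma'.split()
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
total_words = len(words)

tfidf = sorted(
    ((word, idf.get(word, 0) * float(count) / total_words) for word, count in counts.items()),
    key=lambda pair: -pair[1]
)
print(tfidf)  # 'asthma' scores highest: large idf and it appears twice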
Example #2
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)

    input_file = inputs.get('input_file')
    target_file = inputs.get('target_file')
    fields = inputs.get('fields')
    first_line = True
    headers = []
    words_in_documents = {}
    total_documents = 0
    with open(input_file) as _file:
        for line in _file:
            line_data = line.rstrip("\n").split("\t")
            if first_line:
                headers = line_data
                first_line = False
                continue
            text = (" ".join([line_data[headers.index(field)] for field in fields])).lower()
            # Document frequency: count each word at most once per document.
            for word in set(tokenize(text.decode('utf-8'))):
                words_in_documents[word] = words_in_documents.get(word, 0) + 1
            total_documents += 1

    # Inverse document frequency: log(total documents / documents containing the word).
    for key in words_in_documents.iterkeys():
        words_in_documents[key] = math.log(total_documents / (1.0 * words_in_documents[key]))

    # Force stopwords to an IDF of 0 so they never contribute to a TF-IDF score.
    for stopword in STOPWORDS:
        words_in_documents[stopword] = 0

    with open(target_file, 'w') as _file:
        _file.write(to_json(words_in_documents).encode('utf-8'))

    return [{'idf_file': target_file},]
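The weights written to the idf_file above follow the usual formula idf(word) = log(total_documents / documents_containing_word). A minimal standalone illustration with invented document counts:

# --- Standalone sketch, not part of the worker: the IDF formula ---
import math

total_documents = 100                              # invented corpus size
document_frequency = {'asthma': 5, 'study': 90}    # invented per-word document counts

idf = dict(
    (word, math.log(total_documents / float(df)))
    for word, df in document_frequency.items()
)
print(idf)  # rare words ('asthma') get a much larger weight than common ones ('study')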
Example #3
def process(specific_info, data, *args):
    inputs = poliglo.get_inputs(data, specific_info)

    fields = inputs.get('fields', [])
    text = (" ".join([inputs.get(field) or "" for field in fields])).lower()

    # Annotate the combined text with BioPortal ontology terms.
    annotated_data = get_annotations_for_text(text)
    return [{'bioportal_annotated': annotated_data},]
Example #4
def process(specific_info, data, *args):
    inputs = poliglo.get_inputs(data, specific_info)
    connection = args[0].get('connection')

    waiting_queue_name = get_waiting_queue_name(
        data['process']['id'], data['process']['worker_id'], inputs['wait_jobs_from']
    )
    # Park the whole message on a Redis sorted set, scored by timestamp, so a
    # downstream worker can read the queue back and aggregate the results.
    connection.zadd(waiting_queue_name, time(), to_json(data))
    return []
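The worker above parks the whole incoming message on a Redis sorted set so that a downstream worker reading __read_from_queue (such as the aggregators below) can fetch every queued message at once with zrange. A rough standalone sketch of that park-and-read-back pattern, written against the redis-py 3.x API rather than whichever client version the project actually uses, with an invented queue name and a local Redis server assumed:

# --- Standalone sketch, not part of the worker: park messages, read them back ---
import json
import time

import redis  # assumes a local Redis server and the redis-py 3.x zadd signature

connection = redis.Redis()
waiting_queue_name = 'waiting:example_process:example_worker'  # invented name

# Park two messages, scored by their arrival time.
for doc_id in ('doc-1', 'doc-2'):
    payload = json.dumps({'inputs': {'doc_id': doc_id}})
    connection.zadd(waiting_queue_name, {payload: time.time()})

# A later aggregation worker reads the whole queue back in one call.
for raw in connection.zrange(waiting_queue_name, 0, -1):
    print(json.loads(raw))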
Example #5
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)

    tfidf_worker_id = inputs.get('tfidf_worker_id')
    queue = inputs.get('__read_from_queue')
    connection = args[0].get('connection')
    all_words = {}

    all_data = []  # raw queue payloads, kept for the BioPortal merge below

    if queue:
        queue_values = connection.zrange(queue, 0, -1)
        for queue_raw_data in queue_values:
            queue_data = json_loads(queue_raw_data)
            all_data.append(queue_data)  # kept for the BioPortal merge below

            # Collect every per-document TF-IDF value for each word.
            tfidf = select_dict_el(queue_data, 'workers_output.%s.tfidf' % tfidf_worker_id)
            for word, value in tfidf:
                all_words.setdefault(word, []).append(value)

    # Score: 65% average per-document TF-IDF + 35% document-frequency factor,
    # which saturates at a fifth of the highest per-word document count.
    max_appearance = max([len(values) for (word, values) in all_words.iteritems()]) / 5
    tfidf_results = [
        (word, 1.0 * sum(values) / len(values) * 0.65
         + 0.35 * min(len(values) / max_appearance, 1))
        for (word, values) in all_words.iteritems()
    ]
    tfidf_results.sort(key=lambda tup: -tup[1])

    # BioPortal: merge the per-document annotations into ontology-level counts.
    bioportal_worker_id = inputs.get('bioportal_worker_id')
    bioportal_mesh_names_url = inputs.get('bioporta_mesh_names_url')
    mesh_names = json_loads(requests.get(bioportal_mesh_names_url).content)
    bioportal_merged = {}
    for queue_data in all_data:
        bioportal_annotated = select_dict_el(
            queue_data, 'workers_output.%s.bioportal_annotated' % bioportal_worker_id
        )
        for mesh_data in bioportal_annotated.get('data'):
            ontology_id = mesh_data.get('ontology_quote_id')
            if not bioportal_merged.get(ontology_id):
                if not mesh_names.get(ontology_id):
                    continue
                bioportal_merged[ontology_id] = {
                    'ontology_quote_id': ontology_id,
                    'matched_terms': [],
                    'total_frequency': 0,
                    'included_in_documents': 0,
                    'name': mesh_names.get(ontology_id)
                }
            bioportal_merged[ontology_id]['total_frequency'] += mesh_data.get('frequency')
            bioportal_merged[ontology_id]['included_in_documents'] += 1
            bioportal_merged[ontology_id]['matched_terms'] = list(set(
                mesh_data.get('matched_terms') + bioportal_merged[ontology_id]['matched_terms']
            ))
    to_return_bioportal = sorted(
        bioportal_merged.values(), key=lambda k: k['included_in_documents'], reverse=True
    )
    return [{'group_tfidf': tfidf_results, 'bioportal_merged': to_return_bioportal},]
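The group score above blends two signals: 65% of a word's average per-document TF-IDF and 35% of a document-frequency factor that saturates once the word appears in at least a fifth as many documents as the most frequent word. A standalone sketch of that weighting with invented values (float division is used here for clarity; the original relies on Python 2 division semantics):

# --- Standalone sketch, not part of the worker: the group-level weighting ---
# The per-word TF-IDF lists below are invented example values.
all_words = {
    'asthma': [0.66, 0.41, 0.52],   # seen in 3 documents
    'children': [0.16],             # seen in 1 document
}

max_appearance = max(len(values) for values in all_words.values()) / 5.0
scores = [
    (word,
     0.65 * sum(values) / len(values)                  # average per-document TF-IDF
     + 0.35 * min(len(values) / max_appearance, 1))    # document-frequency factor
    for word, values in all_words.items()
]
scores.sort(key=lambda pair: -pair[1])
print(scores)  # 'asthma' comes first: higher average TF-IDF, same capped factor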
Example #6
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)
    mongo_connection = args[0].get('mongo_connection')

    data_filter = inputs.get('data_filter', {})
    fields = inputs.get('fields', [])
    names = inputs.get('names', [])
    collection = inputs.get('collection', [])
    target_file = inputs.get('target_file')
    data_selector = dict([(field, 1) for field in fields])

    matrix_id = inputs.get('matrix_id')

    if matrix_id:
        # Restrict the query to the documents referenced by the given matrix.
        matrix_documents = set()
        matrix = mongo_connection.matrix.find_one({'_id': ObjectId(matrix_id)})
        matrix_documents |= set([
            matrix_el.get('id') for matrix_el in select_dict_el(matrix, 'matrix_dict.matrix', [])
        ])
        matrix_documents |= set([
            matrix_el.get('id')
            for matrix_el in select_dict_el(matrix, 'matrix_dict.studies_order', [])
        ])
        data_filter.update({"id": {"$in": list(matrix_documents)}})

    target_fh = None
    if target_file:
        target_fh = open(target_file, 'w')
        target_fh.write("\t".join(names) + "\n")

    for episte_data in mongo_connection[collection].find(data_filter, data_selector):
        if target_fh:
            # Flatten the selected fields into one tab-separated line per document.
            text = u"\t".join([
                to_unicode(
                    select_dict_el(episte_data, field) or ''
                ).replace('\r\n', ' ').replace('\n', '').replace('\t', ' ') for field in fields
            ]).encode('utf-8')
            target_fh.write(text + "\n")
        else:
            yield dict(
                [
                    (names[i], (select_dict_el(episte_data, field) or ''))
                    for i, field in enumerate(fields)
                ]
            )
    if target_fh:
        target_fh.close()
        yield {'episte_data_target_file': target_file}
Example #7
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)

    document_id = inputs.get('document_id')
    to_find_phrases = inputs.get('to_find_phrases', {})
    where_to_find = inputs.get('where_to_find', [])

    document_text = (" ".join([(inputs.get(where) or '') for where in where_to_find])).lower()

    to_return = {'doc_id': document_id, 'phrases': {}}
    found_all = True
    # A phrase group matches when any of its phrases appears in the document text.
    for phrase_group, phrases in to_find_phrases.iteritems():
        to_return['phrases'][phrase_group] = False
        for phrase in phrases:
            if phrase.lower() in document_text:
                to_return['phrases'][phrase_group] = True
        found_all = found_all and to_return['phrases'][phrase_group]
    to_return['all_phrases'] = found_all
    return [to_return]
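The matching logic above reduces to this: a phrase group is satisfied when any of its phrases occurs as a substring of the lower-cased document text, and all_phrases is true only when every group is satisfied. A toy standalone illustration with invented phrase groups and text:

# --- Standalone sketch, not part of the worker: phrase-group matching ---
document_text = 'Randomised controlled trial of inhaled steroids in children'.lower()
to_find_phrases = {
    'rct': ['randomised controlled trial', 'randomized controlled trial'],
    'adults_only': ['adults aged'],
}

phrases_result = {}
for phrase_group, phrases in to_find_phrases.items():
    phrases_result[phrase_group] = any(phrase.lower() in document_text for phrase in phrases)

print(phrases_result)                # rct matches, adults_only does not
print(all(phrases_result.values()))  # all_phrases -> False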
Example #8
def process(specific_info, data, *args):
    inputs = get_inputs(data, specific_info)

    queue = inputs.get('__read_from_queue')
    connection = args[0].get('connection')

    to_return_data = {
        'phrases': {
            'matching': {},
            'not_matching': {}
        },
        'all_phrases': {
            'matching': [],
            'not_matching': []
        }
    }

    # Re-read every per-document phrase result parked on the waiting queue and
    # group the document ids by matching / not matching phrase group.
    queue_values = connection.zrange(queue, 0, -1)
    for queue_raw_data in queue_values:
        queue_data = json_loads(queue_raw_data).get('inputs')
        match_all = True
        for phrase_group, phrase_value in queue_data.get('phrases', {}).iteritems():
            if phrase_value:
                target = 'matching'
            else:
                target = 'not_matching'
            if not to_return_data['phrases'][target].get(phrase_group):
                to_return_data['phrases'][target][phrase_group] = []
            if queue_data['doc_id'] not in to_return_data['phrases'][target][phrase_group]:
                to_return_data['phrases'][target][phrase_group].append(queue_data['doc_id'])
            match_all = match_all and phrase_value
        target = 'matching' if match_all else 'not_matching'
        if queue_data['doc_id'] not in to_return_data['all_phrases'][target]:
            to_return_data['all_phrases'][target].append(queue_data['doc_id'])

    return [to_return_data,]