Beispiel #1
0
def parse():
    """Flask view: parse POSTed text with three phases of EDG rules.

    On POST, reads the text and the three rule-phase text areas from the
    form, builds the protobuf document/rules objects via ParamHelper,
    parses with Bllip, and renders the result as brat-format JSON strings.
    On any other method, renders the empty form.
    """
    if request.method == 'POST':
        text = request.form['text']
        doc_id = '99999999'
        # FIX: use a print() call — the original Python 2 statement form
        # (`print text`) is a syntax error under Python 3, and the rest of
        # this file uses print() calls.
        print(text)

        rules0 = request.form['rules0']
        rule0_lines = rules0.split("\n")
        rules1 = request.form['rules1']
        rule1_lines = rules1.split("\n")
        rules2 = request.form['rules2']
        rule2_lines = rules2.split("\n")

        param_helper = ParamHelper(text, doc_id, rule0_lines, rule1_lines,
                                   rule2_lines)
        raw_doc = document_pb2.Document()
        edg_rules = edgRules_pb2.EdgRules()
        param_helper.setDocProtoAttributes(raw_doc)
        param_helper.setRuleProtoAttributes(edg_rules)

        # Parse with Bllip and convert to brat visualization JSON.
        parse_bllip = parse_using_bllip(raw_doc, edg_rules)
        brat_bllip = json.dumps(get_brat_data(parse_bllip))
        brat_bllip_added = json.dumps(get_brat_data_added(parse_bllip))

        return render_template('index_edg.html', text=text,
                               rules0=rules0, rules1=rules1, rules2=rules2,
                               brat_string_bllip=brat_bllip,
                               brat_string_bllip_added=brat_bllip_added)
    else:
        return render_template('index_edg.html')
Beispiel #2
0
def parse():
    """Flask view: parse or sentence-split POSTed text.

    When the 'split' form field is absent, the text is parsed with both
    Bllip and Stanford and the results (plus Bllip constituency parses)
    are rendered as brat-format JSON; when 'split' is present, only
    Stanford sentence splitting is performed.
    """
    if request.method != 'POST':
        return render_template('index.html')

    text = request.form['text']
    split = request.form.getlist('split')

    doc = document_pb2.Document()
    doc.text = text

    # Defaults rendered when the corresponding branch is not taken.
    bllip_brat = ''
    stanford_brat = ''
    split_brat = ''
    cst_parses = ''

    if split:
        split_doc = split_sentence_using_stanford(doc)
        split_brat = json.dumps(get_brat_data(split_doc))
    else:
        bllip_doc = parse_using_bllip(doc)
        stanford_doc = parse_using_stanford(doc)
        bllip_brat = json.dumps(get_brat_data(bllip_doc))
        stanford_brat = json.dumps(get_brat_data(stanford_doc))
        # Map sentence index -> Bllip constituency parse string.
        cst_parses = json.dumps(
            {sent.index: sent.parse for sent in bllip_doc.sentence})

    return render_template('index.html',
                           text=text,
                           parse_bllip=bllip_brat,
                           parse_stanford=stanford_brat,
                           split_stanford=split_brat,
                           bllip_cst_parses=cst_parses)
Beispiel #3
0
def run(text):
    """Wrap *text* in a Document (fixed doc id 26815768) and return the
    Bllip parse result."""
    document = document_pb2.Document()
    document.doc_id = '26815768'
    document.text = text
    return parse_using_bllip(document)
Beispiel #4
0
def run():
    """Send 100 copies of a sample abstract to the gRPC parse service in
    batches of 5 and print a running count of parsed documents.
    """
    # Sample abstract text; u'' literal because it contains the non-ASCII
    # character '∼'.
    text = u'MicroRNAs (miRNAs) are small non-coding RNAs of ∼19-24 ' \
           'nucleotides (nt) in length and considered as potent ' \
           'regulators of gene expression at transcriptional and ' \
           'post-transcriptional levels. Here we report the identification ' \
           'and characterization of 15 conserved miRNAs belonging to 13 ' \
           'families from Rauvolfia serpentina through in silico analysis ' \
           'of available nucleotide dataset. The identified mature R. ' \
           'serpentina miRNAs (rse-miRNAs) ranged between 20 and 22nt in ' \
           'length, and the average minimal folding free energy index (MFEI) ' \
           'value of rse-miRNA precursor sequences was found to be ' \
           '-0.815kcal/mol. Using the identified rse-miRNAs as query, their ' \
           'potential targets were predicted in R. serpentina and other plant ' \
           'species. Gene Ontology (GO) annotation showed that predicted ' \
           'targets of rse-miRNAs include transcription factors as well as ' \
           'genes involved in diverse biological processes such as primary ' \
           'and secondary metabolism, stress response, disease resistance, ' \
           'growth, and development. Few rse-miRNAs were predicted to target ' \
           'genes of pharmaceutically important secondary metabolic pathways ' \
           'such as alkaloids and anthocyanin biosynthesis. Phylogenetic ' \
           'analysis showed the evolutionary relationship of rse-miRNAs and ' \
           'their precursor sequences to homologous pre-miRNA sequences from ' \
           'other plant species. The findings under present study besides giving ' \
           'first hand information about R. serpentina miRNAs and their targets, ' \
           'also contributes towards the better understanding of miRNA-mediated ' \
           'gene regulatory processes in plants.'

    raw_doc = document_pb2.Document()
    raw_doc.doc_id = '26815768'
    raw_doc.text = text

    # Duplicate the same document 100 times to exercise request batching.
    one_hundred_docs = [raw_doc] * 100

    # This is a simple function to make requests out of a list of documents. We
    # put 5 documents in each request.
    requests = request_iter_docs(one_hundred_docs,
                                 request_size=5,
                                 request_type=rpc_pb2.Request.PARSE_BLLIP)

    # Given a request iterator, send requests in parallel and get responses.
    # NOTE(review): server address and port are hard-coded — confirm they
    # match the current deployment.
    responses_queue = grpcapi.get_queue(server='128.4.20.169',
                                        port=8900,
                                        request_thread_num=10,
                                        iterable_request=requests)
    count = 0
    for response in responses_queue:
        for doc in response.document:
            count += 1
            # Running count, document id, and number of parsed sentences.
            print(count, doc.doc_id, len(doc.sentence))
Beispiel #5
0
def run():
    """Parse a sample abstract with Bllip and Stanford, then run Stanford
    sentence splitting 100 times, printing each result/progress line.
    """
    # Sample abstract text; u'' literal because it contains the non-ASCII
    # character '∼'.
    text = u'MicroRNAs (miRNAs) are small non-coding RNAs of ∼19-24 ' \
           'nucleotides (nt) in length and considered as potent ' \
           'regulators of gene expression at transcriptional and ' \
           'post-transcriptional levels. Here we report the identification ' \
           'and characterization of 15 conserved miRNAs belonging to 13 ' \
           'families from Rauvolfia serpentina through in silico analysis ' \
           'of available nucleotide dataset. The identified mature R. ' \
           'serpentina miRNAs (rse-miRNAs) ranged between 20 and 22nt in ' \
           'length, and the average minimal folding free energy index (MFEI) ' \
           'value of rse-miRNA precursor sequences was found to be ' \
           '-0.815kcal/mol. Using the identified rse-miRNAs as query, their ' \
           'potential targets were predicted in R. serpentina and other plant ' \
           'species. Gene Ontology (GO) annotation showed that predicted ' \
           'targets of rse-miRNAs include transcription factors as well as ' \
           'genes involved in diverse biological processes such as primary ' \
           'and secondary metabolism, stress response, disease resistance, ' \
           'growth, and development. Few rse-miRNAs were predicted to target ' \
           'genes of pharmaceutically important secondary metabolic pathways ' \
           'such as alkaloids and anthocyanin biosynthesis. Phylogenetic ' \
           'analysis showed the evolutionary relationship of rse-miRNAs and ' \
           'their precursor sequences to homologous pre-miRNA sequences from ' \
           'other plant species. The findings under present study besides giving ' \
           'first hand information about R. serpentina miRNAs and their targets, ' \
           'also contributes towards the better understanding of miRNA-mediated ' \
           'gene regulatory processes in plants.'

    raw_doc = document_pb2.Document()
    raw_doc.doc_id = '26815768'
    raw_doc.text = text

    # Parse using Bllip parser.
    result = parse_using_bllip(raw_doc)
    print(result)

    # Parse Using Stanford CoreNLP parser.
    result = parse_using_stanford(raw_doc)
    print(result)

    # Only split sentences using Stanford CoreNLP.
    # Repeated 100 times — presumably a throughput smoke test; confirm.
    for i in range(100):
        result = split_using_stanford(raw_doc)
        print('Split {} documents'.format(i))
Beispiel #6
0
def run():
    """Parse a text file with Bllip using three phases of EDG rules and
    print each sentence followed by its extra dependencies.

    Command-line arguments:
        sys.argv[1]: input text file.
        sys.argv[2:5]: phase 0, 1 and 2 rule files.
    """
    def _read_lines(path):
        # Read a whole rule file as a list of lines; `with` guarantees the
        # handle is closed even if reading fails (the original used manual
        # open/close pairs, which leak on exception).
        with open(path, "r") as fh:
            return fh.readlines()

    with open(sys.argv[1], "r") as text_fh:
        text = text_fh.read()

    doc_id = '99999999'
    rule0_lines = _read_lines(sys.argv[2])
    rule1_lines = _read_lines(sys.argv[3])
    rule2_lines = _read_lines(sys.argv[4])

    param_helper = ParamHelper(text, doc_id, rule0_lines, rule1_lines,
                               rule2_lines)

    raw_doc = document_pb2.Document()
    edg_rules = edgRules_pb2.EdgRules()

    param_helper.setDocProtoAttributes(raw_doc)
    param_helper.setRuleProtoAttributes(edg_rules)

    # Parse using Bllip parser.
    result = parse_using_bllip(raw_doc, edg_rules)
    helper = DocHelper(result)
    for sentence in result.sentence:
        print(helper.text(sentence))
        for dep_extra in sentence.dependency_extra:
            print(helper.printExtraDependency(sentence, dep_extra))
        print("===============================")
    def mask_entity(self, mask_duids=None):
        """Return a copy of self.doc with entity mentions replaced by
        placeholders and entity character offsets rewritten to match.

        Masked entities become 'BIOENTITIES' when the original mention
        ends with 's', otherwise 'BIOENTITY'. When mask_duids is given,
        entities whose duid is not in it keep their original text (their
        offsets are still rewritten for the new text).

        Args:
            mask_duids: optional collection of entity duids to mask;
                None masks every entity.

        Returns:
            A new document_pb2.Document with masked text and updated
            entity offsets.

        Raises:
            ValueError: if the document contains overlapping entities —
                the offset rewriting below assumes non-overlapping spans.
        """
        if self.has_overlap_entity():
            raise ValueError('Overlapped entities: ' + self.doc.doc_id)

        masked = document_pb2.Document()
        masked.CopyFrom(self.doc)

        slices = []      # pieces of the masked text, joined at the end
        start = 0        # next unread position in the original text
        mask_start = 0   # length of the masked text accumulated so far

        # Sort by char start.
        entities = masked.entity.values()
        entities = sorted(entities, key=lambda a: a.char_start)

        for entity in entities:
            # Copy untouched text between the previous entity and this one.
            slices.append(self.doc.text[start:entity.char_start])
            mask_start += len(slices[-1])

            if mask_duids is not None and entity.duid not in mask_duids:
                slices.append(self.text(entity))
            else:
                # Not using entity type as replacement because it may change
                # the parsing, ENTITY seems to affect the parsing less.
                if self.text(entity).endswith('s'):
                    slices.append('BIOENTITIES')
                else:
                    slices.append('BIOENTITY')

            # Rewrite this entity's offsets in the masked copy; char_end is
            # inclusive, hence the -1. Save the original end before it is
            # overwritten so `start` can advance past the original span.
            entity_end = entity.char_end
            entity.char_start = mask_start
            entity.char_end = mask_start + len(slices[-1]) - 1

            mask_start += len(slices[-1])
            start = entity_end + 1

        # Remainder of the original text after the last entity.
        slices.append(self.doc.text[start:])

        masked.text = ''.join(slices)
        return masked
Beispiel #8
0
def run():
    """Load EDG rules from three hard-coded files and a test document from
    JSON, parse it with Bllip, then check and propagate the constraint that
    'arg0' arguments must be GENE entities.
    """
    def _read_lines(path):
        # `with` guarantees the handle is closed even on error (the
        # original used manual open/close pairs, which leak on exception).
        with open(path, "r") as fh:
            return fh.readlines()

    doc_id = '99999999'
    rule_phase0_filename = '/home/leebird/Projects/nlputils/visual/uploads/rules_phase0.txt'
    rule_phase1_filename = '/home/leebird/Projects/nlputils/visual/uploads/rules_phase1.txt'
    rule_phase2_filename = '/home/leebird/Projects/nlputils/visual/uploads/rules_phase2.txt'
    rule0_lines = _read_lines(rule_phase0_filename)
    rule1_lines = _read_lines(rule_phase1_filename)
    rule2_lines = _read_lines(rule_phase2_filename)

    with open('/home/leebird/Projects/nlputils/utils/typing/test.json') as f:
        json_doc = json.load(f)
        # Normalize entity type names to upper case before proto parsing.
        for t in json_doc['entity'].values():
            t['entityType'] = t['entityType'].upper()
        text = json.dumps(json_doc)
        raw_doc = json_format.Parse(text, document_pb2.Document(), True)

    param_helper = ParamHelper(text, doc_id, rule0_lines, rule1_lines,
                               rule2_lines)

    # Only the rule attributes are needed; the document itself comes from
    # the JSON file above.
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setRuleProtoAttributes(edg_rules)

    # Parse using Bllip parser.
    doc = parse_using_bllip(raw_doc, edg_rules)
    helper = DocHelper(doc)
    invalid_deps = constraint_args(helper, {'arg0': {document_pb2.Entity.GENE}})
    print(invalid_deps)
    propagate(helper, {'arg0': {document_pb2.Entity.GENE}}, invalid_deps)
Beispiel #9
0
def upload():
    """Flask view: accept three uploaded EDG rule files, falling back to
    the form's rule text areas for any empty upload, then parse the
    submitted text with Bllip and render the results as brat JSON.
    """
    if request.method != 'POST':
        return render_template('index_edg.html')

    # Read each uploaded rule file; an empty string means nothing usable
    # was uploaded for that slot.
    uploaded = [save_read_uploaded_file(request.files[field])
                for field in ('ruleFile0', 'ruleFile1', 'ruleFile2')]

    text = request.form['text']
    # Fall back to the matching form text area when the upload was empty.
    rules0, rules1, rules2 = [
        request.form[field] if content == "" else content
        for content, field in zip(uploaded, ('rules0', 'rules1', 'rules2'))]

    doc_id = "9999999"
    param_helper = ParamHelper(text, doc_id,
                               rules0.split("\n"),
                               rules1.split("\n"),
                               rules2.split("\n"))
    raw_doc = document_pb2.Document()
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setDocProtoAttributes(raw_doc)
    param_helper.setRuleProtoAttributes(edg_rules)

    # Parse with Bllip and convert to brat visualization JSON.
    parsed = parse_using_bllip(raw_doc, edg_rules)
    brat_bllip = json.dumps(get_brat_data(parsed))
    brat_bllip_added = json.dumps(get_brat_data_added(parsed))

    return render_template('index_edg.html', text=text,
                           rules0=rules0, rules1=rules1, rules2=rules2,
                           brat_string_bllip=brat_bllip,
                           brat_string_bllip_added=brat_bllip_added)
def run():
    """Batch-parse every file in an input directory through the gRPC EDG
    service and print the selected relations found in each sentence.

    Command-line arguments:
        sys.argv[1]: directory of input text files (doc id = file stem).
        sys.argv[2]: phase-0 rule file.
    """
    # Iterate through all files in the input directory and build doc_list.
    input_dir_path = sys.argv[1]
    document_list = []
    for input_file in glob.glob(input_dir_path + "/*"):
        with open(input_file, "r") as text_fh:
            text = text_fh.read()
        # FIX: the original created the Document twice in a row.
        raw_doc = document_pb2.Document()
        raw_doc.text = text
        # Doc id is the file name without its extension.
        raw_doc.doc_id = os.path.splitext(os.path.basename(input_file))[0]
        document_list.append(raw_doc)

    with open(sys.argv[2], "r") as fh0:
        rule0_lines = fh0.readlines()

    # TODO: update ParamHelper — only phase-0 rules are used here and the
    # doc attributes ("NA") are never applied.
    param_helper = ParamHelper("NA", "NA", rule0_lines, [], [])
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setRuleProtoAttributes(edg_rules)

    # This is a simple function to make requests out of a list of documents.
    # We put 5 documents in each request.
    requests = edg_request_iter_docs(
        document_list,
        edg_rules,
        request_size=5,
        request_type=rpc_pb2.EdgRequest.PARSE_BLLIP)

    # Given a request iterator, send requests in parallel and get responses.
    responses_queue = grpcapi.get_queue(server='128.4.20.169',
                                        port=8902,
                                        request_thread_num=10,
                                        iterable_request=requests,
                                        edg_request_processor=True)

    # Relation types worth printing (loop-invariant, so built once).
    to_print_rel = ["inv", "reg", "ass", "exp", "cmp", "isa", "fnd"]

    count = 0
    for response in responses_queue:
        for doc in response.document:
            helper = DocHelper(doc)
            doc_id = doc.doc_id
            for sent_num, sentence in enumerate(doc.sentence):
                sent_text = helper.text(sentence)
                edg_relations = EdgRelations(doc_id, sent_num)
                edg_relations.setRelations(helper, sentence,
                                           sentence.dependency_extra)

                for edg_relation in edg_relations.relations:
                    if edg_relation.name not in to_print_rel:
                        continue
                    for numb_args in edg_relation.getEdgRelationNumArgs():
                        print("Sentence: " + doc_id + "\t" + str(sent_num) +
                              "\t" + sent_text)
                        print("Relation: " + edg_relation.name + "\t" +
                              edg_relation.trigger_head + "\t" +
                              edg_relation.trigger_phrase)
                        print("Arg0: " + numb_args[0])
                        print("Arg1: " + numb_args[1])
                        print("Arg2: " + numb_args[2])
                        print("\n")
            count += 1
    def load_from_brat_file(doc_id, text_file, annotation_file):
        """Build a document_pb2.Document from a brat .txt/.ann file pair.

        Args:
            doc_id: identifier to assign to the new document.
            text_file: path to the brat text file (newlines become spaces
                so character offsets stay valid on one line).
            annotation_file: path to the brat annotation (.ann) file.

        Returns:
            A document_pb2.Document populated with text, entities, events
            and relations.

        Raises:
            AssertionError: on an empty annotation line or a line whose
                leading character is not one of T/E/R/*/M/A.
        """
        doc = document_pb2.Document()
        doc.doc_id = doc_id
        helper = DocHelper(doc)
        with codecs.open(text_file, 'r', encoding='utf8') as f:
            # Replace newlines with spaces.
            text = f.read().replace('\n', ' ')
            doc.text = text

        with codecs.open(annotation_file, 'r', encoding='utf8') as f:
            entities, events, relations = [], [], []

            for line in f:
                line = line.strip('\r\n')

                # brat line types: T=entity, E=event, R=relation,
                # *=equivalence, M/A=attribute. M and A lines pass this
                # check but are otherwise ignored below.
                assert len(line.strip()) > 0
                assert line[0] == 'T' or line[0] == 'E' or \
                       line[0] == 'R' or line[0] == '*' or \
                       line[0] == 'M' or line[0] == 'A'

                if line[0] == 'T':
                    entity_id, entity_text, entity_type, entity_start, entity_end \
                        = parser.parse_entity(line)

                    # brat spans are end-exclusive; proto char_end is
                    # inclusive, hence the -1.
                    entity = helper.add_entity(duid=entity_id)
                    entity.char_start = entity_start
                    entity.char_end = entity_end - 1
                    entity.entity_type = entity_type

                elif line[0] == 'E':
                    events.append(parser.parse_event(line))
                elif line[0] == 'R' or line[0] == '*':
                    relations.append(parser.parse_relation(line))

            # Events become relations whose first argument is the trigger.
            for eid, etype, trigger_id, arguments, attrs in events:
                event = helper.add_relation(relation_type=etype, duid=eid)
                trigger = event.argument.add()
                trigger.entity_duid = trigger_id
                trigger.role = 'Trigger'
                for role, arg_id in arguments:
                    arg = event.argument.add()
                    arg.role = role
                    arg.entity_duid = arg_id

                if attrs is not None:
                    for key, values in attrs.items():
                        for value in values:
                            attr = event.attribute.add()
                            attr.key = key
                            attr.value = value

            # Only 'R'-prefixed ids are kept as duids; '*' equivalence
            # lines get an auto-assigned id from add_relation.
            for rid, rtype, arguments, attrs in relations:
                if rid.startswith('R'):
                    relation = helper.add_relation(relation_type=rtype,
                                                   duid=rid)
                else:
                    relation = helper.add_relation(relation_type=rtype)
                for role, arg_id in arguments:
                    arg = relation.argument.add()
                    arg.role = role
                    arg.entity_duid = arg_id

                if attrs is not None:
                    for key, values in attrs.items():
                        for value in values:
                            attr = relation.attribute.add()
                            attr.key = key
                            attr.value = value
        return doc
def run():
    """Batch-parse every file in an input directory through the gRPC EDG
    service and print one extra-dependency analysis line per sentence
    (or an empty-analysis line when a sentence has none).

    Command-line arguments:
        sys.argv[1]: directory of input text files (doc id = file stem).
        sys.argv[2]: phase-0 rule file.
    """
    # Iterate through all files in the input directory and build doc_list.
    input_dir_path = sys.argv[1]
    document_list = []
    for input_file in glob.glob(input_dir_path + "/*"):
        with open(input_file, "r") as text_fh:
            text = text_fh.read()
        # FIX: the original created the Document twice in a row.
        raw_doc = document_pb2.Document()
        raw_doc.text = text
        # Doc id is the file name without its extension.
        raw_doc.doc_id = os.path.splitext(os.path.basename(input_file))[0]
        document_list.append(raw_doc)

    with open(sys.argv[2], "r") as fh0:
        rule0_lines = fh0.readlines()

    # TODO: update ParamHelper — only phase-0 rules are used here and the
    # doc attributes ("NA") are never applied.
    param_helper = ParamHelper("NA", "NA", rule0_lines, [], [])
    edg_rules = edgRules_pb2.EdgRules()
    param_helper.setRuleProtoAttributes(edg_rules)

    # This is a simple function to make requests out of a list of documents.
    # We put 5 documents in each request.
    requests = edg_request_iter_docs(document_list, edg_rules,
                                     request_size=5,
                                     request_type=rpc_pb2.EdgRequest.PARSE_BLLIP)

    # Given a request iterator, send requests in parallel and get responses.
    responses_queue = grpcapi.get_queue(server='128.4.20.169',
                                        port=8902,
                                        request_thread_num=10,
                                        iterable_request=requests,
                                        edg_request_processor=True)
    count = 0
    for response in responses_queue:
        for doc in response.document:
            helper = DocHelper(doc)
            doc_id = doc.doc_id
            for sent_num, sentence in enumerate(doc.sentence):
                sent_text = helper.text(sentence)
                has_extra = False
                for dep_extra in sentence.dependency_extra:
                    has_extra = True
                    print(doc_id+"\t"+str(sent_num)+"\t"+helper.printExtraDependencyAnalysis(sentence,dep_extra)+"\t"+sent_text)
                # Emit a placeholder line for sentences with no extras.
                if not has_extra:
                    print(doc_id+"\t"+str(sent_num)+"\t"+helper.printEmptyExtraDependencyAnalysis(sentence)+"\t"+sent_text)
            count += 1