Example 1
def stop_word_tagger_hub(input_dict):
    """
    Apply the *stop_word_tagger* object to the Annotated Document Corpus (*adc*):

    1. first select only annotations of the token type *element_annotation*,
    2. apply the stop word tagger,
    3. create new *output_feature* annotations with the outputs of the stop word tagger.

    :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    :param stop_word_tagger: A Python dictionary containing the stop word tagger object and its arguments.
    :param element_annotation: Which annotated part of the document to search for stop words.
    :param output_feature: How to annotate the newly discovered stop words.

    :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    """

    if isinstance(input_dict['stop_word_tagger'], LatinoObject):
        from ...latino.library_gen import latino_tag_adcstopwords
        input_dict['tagger'] = input_dict['stop_word_tagger']  # TODO: temporary
        return executeFunction.apply_async([latino_tag_adcstopwords, input_dict], queue="windows").wait() if settings.USE_WINDOWS_QUEUE \
            else latino_tag_adcstopwords(input_dict)

    else:
        adc = input_dict['adc']
        tagger_dict = input_dict['stop_word_tagger']
        input_annotation = input_dict['element_annotation']
        output_annotation = input_dict['output_feature']
        return universal_word_tagger_hub(adc, tagger_dict, input_annotation, output_annotation)
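
The non-Latino branch just forwards the pieces of *input_dict* to universal_word_tagger_hub. A minimal sketch of a call, assuming the tagger dict follows the {'object': ..., 'args': ..., 'kargs': ...} layout that tokenizer_hub (Example 5) unpacks, and that *adc* and *tagger_dict* come from upstream widgets:

input_dict = {
    'adc': adc,                        # workflows.textflows.DocumentCorpus from an upstream widget
    'stop_word_tagger': tagger_dict,   # assumed layout: {'object': <tagger>, 'args': [...], 'kargs': {...}}
    'element_annotation': 'Token',     # hypothetical annotation type to scan
    'output_feature': 'StopWord',      # hypothetical feature name to attach
}
result = stop_word_tagger_hub(input_dict)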
Example 2
def stop_word_tagger_hub(input_dict):
    """
    Apply the *stop_word_tagger* object to the Annotated Document Corpus (*adc*):

    1. first select only annotations of the token type *element_annotation*,
    2. apply the stop word tagger,
    3. create new *output_feature* annotations with the outputs of the stop word tagger.

    :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    :param stop_word_tagger: A Python dictionary containing the stop word tagger object and its arguments.
    :param element_annotation: Which annotated part of the document to search for stop words.
    :param output_feature: How to annotate the newly discovered stop words.

    :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    """

    if isinstance(input_dict['stop_word_tagger'], LatinoObject):
        from ...latino.library_gen import latino_tag_adcstopwords
        input_dict['tagger'] = input_dict['stop_word_tagger']  # TODO: temporary
        return executeFunction.apply_async([latino_tag_adcstopwords, input_dict], queue="windows").wait() if settings.USE_WINDOWS_QUEUE \
            else latino_tag_adcstopwords(input_dict)

    else:
        adc = input_dict['adc']
        tagger_dict = input_dict['stop_word_tagger']
        input_annotation = input_dict['element_annotation']
        output_annotation = input_dict['output_feature']
        return universal_word_tagger_hub(adc, tagger_dict, input_annotation,
                                         output_annotation)
Example 3
def stem_lemma_tagger_hub(input_dict):
    if isinstance(input_dict['tagger'], LatinoObject):  # check if this is a Latino object
        from ...latino.library_gen import latino_tag_adc_stem_lemma
        return latino_tag_adc_stem_lemma(input_dict) if not settings.USE_WINDOWS_QUEUE \
            else executeFunction.apply_async([latino_tag_adc_stem_lemma, input_dict], queue="windows").wait()
    else:
        adc = input_dict['adc']
        tagger_dict = input_dict['tagger']
        input_annotation = input_dict['element_annotation']
        output_annotation = input_dict['output_feature']
        return universal_word_tagger_hub(adc, tagger_dict, input_annotation, output_annotation)
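
Here universal_word_tagger_hub receives the tagger dict unchanged; a word-level stemmer is a typical payload. A minimal sketch with NLTK's PorterStemmer, which is a plausible backing object rather than one this snippet confirms:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('running'))   # -> 'run'
print(stemmer.stem('stemming'))  # -> 'stem'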
Example 4
def stem_lemma_tagger_hub(input_dict):
    if isinstance(input_dict['tagger'],
                  LatinoObject):  # check if this is a Latino object
        from ...latino.library_gen import latino_tag_adc_stem_lemma
        return latino_tag_adc_stem_lemma(input_dict) if not settings.USE_WINDOWS_QUEUE \
            else executeFunction.apply_async([latino_tag_adc_stem_lemma, input_dict], queue="windows").wait()
    else:
        adc = input_dict['adc']
        tagger_dict = input_dict['tagger']
        input_annotation = input_dict['element_annotation']
        output_annotation = input_dict['output_feature']
        return universal_word_tagger_hub(adc, tagger_dict, input_annotation,
                                         output_annotation)
Example 5
def tokenizer_hub(input_dict):
    """
    Apply the *tokenizer* object to the Annotated Document Corpus (*adc*):

    1. first select only annotations of type *input_annotation*,
    2. apply the tokenizer
    3. create new annotations *output_annotation* with the outputs of the tokenizer.

    :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    :param tokenizer: A Python dictionary containing the Tokenizer object and its arguments.
    :param input_annotation: Which annotated part of the document to split.
    :param output_annotation: How to annotate the newly discovered tokens.

    :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    """
    tokenizer_dict = input_dict['tokenizer']

    if type(tokenizer_dict) != dict:
        from workflows.tasks import executeFunction
        from tf_latino.latino.library_gen import latino_tokenize_words

        return latino_tokenize_words(input_dict) if not settings.USE_WINDOWS_QUEUE \
            else executeFunction.apply_async([latino_tokenize_words,input_dict],queue="windows").wait()
    else:
        tokenizer = tokenizer_dict['object']
        args = tokenizer_dict.get('args', [])
        kwargs = tokenizer_dict.get('kargs', {})
        input_annotation = input_dict['input_annotation']
        output_annotation = input_dict['output_annotation']
        adc = input_dict['adc']
        docs_count = len(adc.documents)
        for i, document in enumerate(adc.documents):
            if document.features['contentType'] == "Text":
                if not document.text:
                    continue  # nothing to tokenize in an empty document
                for annotation, subtext in document.get_annotations_with_text(
                        input_annotation):  #all annotations of this type
                    new_token_spans = tokenizer.span_tokenize(
                        subtext, *args, **kwargs)
                    for starts_at, ends_at in new_token_spans:
                        document.annotations.append(
                            Annotation(annotation.span_start + starts_at,
                                       annotation.span_start + ends_at - 1,
                                       output_annotation))
            if i % 100 == 0:
                print int((i + 1) * 1.0 / docs_count * 100)
            # widget.progress = int((i + 1) * 1.0 / docs_count * 100)
            #widget.save()
        return {'adc': adc}
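
Any object that exposes a span_tokenize(text) method fits the 'object' slot of the tokenizer dict; NLTK's tokenizers do. A minimal sketch of what the hub consumes, mirroring the .get('args', []) and .get('kargs', {}) lookups above:

from nltk.tokenize import WhitespaceTokenizer

tokenizer_dict = {
    'object': WhitespaceTokenizer(),  # anything with span_tokenize(text)
    'args': [],                       # extra positional arguments, unused here
    'kargs': {},                      # note the key is 'kargs', not 'kwargs'
}
spans = list(tokenizer_dict['object'].span_tokenize("the quick fox"))
print(spans)  # -> [(0, 3), (4, 9), (10, 13)]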
Example 6
def stem_lemma_tagger_hub(input_dict):
    if input_dict['tagger'].__class__.__name__ == "LatinoObject":  # check if this is a Latino object
        from tf_latino.latino.library_gen import latino_tag_adc_stem_lemma
        from workflows.tasks import executeFunction

        return latino_tag_adc_stem_lemma(input_dict) if not settings.USE_WINDOWS_QUEUE \
            else executeFunction.apply_async([latino_tag_adc_stem_lemma, input_dict], queue="windows").wait()
    else:
        adc = input_dict['adc']
        tagger_dict = input_dict['tagger']
        input_annotation = input_dict['element_annotation']
        pos_annotation = input_dict.get('pos_annotation')
        output_annotation = input_dict['output_feature']
        return universal_word_tagger_hub(adc, tagger_dict, input_annotation,
                                         output_annotation, pos_annotation)
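
This variant additionally threads an optional *pos_annotation* through to universal_word_tagger_hub; POS hints matter mostly for lemmatizers. NLTK's WordNetLemmatizer shows why (an assumed example, not taken from this snippet; it needs the WordNet corpus installed):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('better'))           # -> 'better' (defaults to noun)
print(lemmatizer.lemmatize('better', pos='a'))  # -> 'good' (with the adjective hint)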
Example 7
def tokenizer_hub(input_dict):
    """
    Apply the *tokenizer* object to the Annotated Document Corpus (*adc*):

    1. first select only annotations of type *input_annotation*,
    2. apply the tokenizer
    3. create new annotations *output_annotation* with the outputs of the tokenizer.

    :param adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    :param tokenizer: A Python dictionary containing the Tokenizer object and its arguments.
    :param input_annotation: Which annotated part of the document to split.
    :param output_annotation: How to annotate the newly discovered tokens.

    :returns adc: Annotated Document Corpus (workflows.textflows.DocumentCorpus)
    """
    tokenizer_dict = input_dict['tokenizer']

    if type(tokenizer_dict) != dict:
        from ...latino.library_gen import latino_tokenize_words
        return latino_tokenize_words(input_dict) if not settings.USE_WINDOWS_QUEUE \
            else executeFunction.apply_async([latino_tokenize_words, input_dict], queue="windows").wait()
    else:
        tokenizer = tokenizer_dict['object']
        args = tokenizer_dict.get('args', [])
        kwargs = tokenizer_dict.get('kargs', {})
        input_annotation = input_dict['input_annotation']
        output_annotation = input_dict['output_annotation']
        adc = input_dict['adc']
        docs_count = len(adc.documents)
        for i, document in enumerate(adc.documents):
            if document.features['contentType'] == "Text":
                if not document.text:
                    continue  # nothing to tokenize in an empty document
                for annotation, subtext in document.get_annotations_with_text(input_annotation):  # all annotations of this type
                    new_token_spans = tokenizer.span_tokenize(subtext, *args, **kwargs)
                    for starts_at, ends_at in new_token_spans:
                        document.annotations.append(Annotation(annotation.span_start + starts_at, annotation.span_start + ends_at - 1, output_annotation))
            if i % 100 == 0:
                print int((i + 1) * 1.0 / docs_count * 100)
            # widget.progress = int((i + 1) * 1.0 / docs_count * 100)
            #widget.save()
        return {'adc': adc}
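
The span arithmetic above re-bases the tokenizer's offsets, which are relative to *subtext*, onto document coordinates, storing an inclusive end index. With hypothetical numbers:

span_start = 100               # annotation start within the document
starts_at, ends_at = 4, 9      # token offsets within the subtext
print(span_start + starts_at)    # -> 104, document-level start
print(span_start + ends_at - 1)  # -> 108, inclusive document-level end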
Example 8
def pos_tagger_hub(input_dict):
    if isinstance(input_dict['pos_tagger'], LatinoObject):  # check if this is a Latino object
        from ...latino.library_gen import latino_pos_tag
        adc = executeFunction.apply_async([latino_pos_tag, input_dict], queue="windows").wait()['adc'] \
            if settings.USE_WINDOWS_QUEUE else latino_pos_tag(input_dict)
    else:
        adc = universal_sentence_tagger_hub(input_dict)['adc']

    number_of_letters = int(input_dict['num_of_letters'])
    if number_of_letters != -1:
        element_annotation_name = input_dict['element_annotation']
        output_annotation_name = input_dict['output_feature']
        for doc in adc.documents:
            for annotation in doc.get_annotations(element_annotation_name):
                if output_annotation_name not in annotation.features:
                    print input_dict['pos_tagger'], annotation.features
                    print doc.features
                else:
                    annotation.features[output_annotation_name] = annotation.features[output_annotation_name][0:number_of_letters]

    return {'adc': adc}
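
The *num_of_letters* post-processing step simply truncates each stored tag, collapsing fine-grained tags into coarser classes; with Penn Treebank tags, for example:

tag = 'NNS'             # plural noun
number_of_letters = 2
print(tag[0:number_of_letters])  # -> 'NN', so singular and plural nouns share a tag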
Example 9
def pos_tagger_hub(input_dict):
    if input_dict['pos_tagger'].__class__.__name__ == "LatinoObject":  # check if this is a Latino object
        from tf_latino.latino.library_gen import latino_pos_tag
        from workflows.tasks import executeFunction

        adc = executeFunction.apply_async([latino_pos_tag, input_dict], queue="windows").wait()['adc'] \
            if settings.USE_WINDOWS_QUEUE else latino_pos_tag(input_dict)
    else:
        adc = universal_sentence_tagger_hub(input_dict)['adc']

    number_of_letters = int(input_dict['num_of_letters'])
    if number_of_letters != -1:
        element_annotation_name = input_dict['element_annotation']
        output_annotation_name = input_dict['output_feature']
        for doc in adc.documents:
            for annotation in doc.get_annotations(element_annotation_name):
                if output_annotation_name not in annotation.features:
                    print input_dict['pos_tagger'], annotation.features
                    print doc.features
                else:
                    annotation.features[output_annotation_name] = annotation.features[output_annotation_name][0:number_of_letters]

    return {'adc': adc}