Beispiel #1
0
    def __init__(self,
                 nlp: Language = None,
                 support_overlap: bool = False,
                 log_level: int = logging.WARNING,
                 encoding: str = None,
                 doc_name_depth: int = 0,
                 **kwargs):
        """Initialize the document reader and register the Doc-level
        ``doc_name`` extension.

        @param nlp: Spacy Language model (required despite the None default)
        @param support_overlap: whether need to support overlapped annotations
        @param log_level: logging level configuration
        @param encoding: txt encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use full absolution path
                if you are dealing with multiple directories,this is helpful to
                locate the original files
        @param kwargs: other parameters, set verbatim as instance attributes
        @raise NameError: if ``nlp`` is not supplied
        """
        # Fail fast before mutating the instance: the original applied
        # **kwargs via setattr first, leaving a half-initialized object
        # when nlp was missing.
        if nlp is None:
            raise NameError('parameter "nlp" need to be defined')
        for param_name, value in kwargs.items():
            setattr(self, param_name, value)
        self.nlp = nlp
        self.encoding = encoding
        self.doc_name_depth = doc_name_depth
        self.support_overlap = support_overlap
        self.set_logger(log_level)
        # Register once, globally on Doc; guard keeps re-instantiation safe.
        if not Doc.has_extension('doc_name'):
            Doc.set_extension('doc_name', default='')
    def __init__(self):
        """Register this pipe's Doc extension and the Token-level
        ``is_lexical`` flag, skipping any already-registered extension."""
        super().__init__()

        # Doc-level output slot named after the component.
        doc_attr = self.name
        if not Doc.has_extension(doc_attr):
            Doc.set_extension(doc_attr, default=[])

        # Per-token boolean, False until set by the pipeline.
        if not Token.has_extension('is_lexical'):
            Token.set_extension('is_lexical', default=False)
    def __init__(self, paths=None):
        """Register Doc extensions, resolve corpus file locations, load
        corpora, and load the spaCy model.

        @param paths: optional list of 7 path strings overriding the bundled
                corpora, in order: NGSL, NAWL, BSL, TSL, COCA Academic,
                COCA Technical, COCA General. When None, the files bundled
                under ``Corpora/`` next to this module are used.
        """
        super().__init__()
        # Register every extension with its own guard. The original gated
        # all nine word-list extensions on the single `self.name` check, so
        # if another component had already registered `self.name` alone,
        # the remaining extensions would silently never be registered.
        if not Doc.has_extension(self.name):
            Doc.set_extension(self.name, default=[])
        for ext in ('ngsl_words', 'nawl_words', 'tsl_words', 'fpc_words',
                    'cocaacad_words', 'cocatech_words', 'cocagenband1_words',
                    'cocagenband2_words', 'cocagenband3_words'):
            if not Doc.has_extension(ext):
                Doc.set_extension(ext, default=[])

        if paths is None:
            # Default file locations: corpora bundled alongside this module.
            corpora_dir = os.path.join(os.path.dirname(__file__), 'Corpora')
            self.fnameNGSL = os.path.join(
                corpora_dir, 'NGSL+1.01+by+band - Frequency.csv')
            self.fnameNAWL = os.path.join(corpora_dir, 'NAWL_SFI.csv')
            self.fnameBSL = os.path.join(
                corpora_dir, 'BSL_1.01_SFI_freq_bands.csv')
            self.fnameTSL = os.path.join(
                corpora_dir, 'TSL+1.1+Ranked+by+Frequency - TSL.csv')
            self.fnameCOCAAcad = os.path.join(corpora_dir,
                                              'COCA Academic.csv')
            self.fnameCOCATech = os.path.join(corpora_dir,
                                              'COCA Technical.csv')
            self.fnameCOCAGen = os.path.join(corpora_dir, 'COCA General.csv')
        else:
            # File locations passed positionally by the caller
            # (order documented in the docstring above).
            self.fnameNGSL = paths[0]
            self.fnameNAWL = paths[1]
            self.fnameBSL = paths[2]
            self.fnameTSL = paths[3]
            self.fnameCOCAAcad = paths[4]
            self.fnameCOCATech = paths[5]
            self.fnameCOCAGen = paths[6]

        ## Corpus token totals, taken by Vishal's code.
        self.NGSLTotal = 273613534
        self.NAWLTotal = 288176225
        self.TSLTotal = 1560194
        self.BSLTotal = 64651722
        self.COCAAcadTotal = 120032441

        # read the corpora
        self.read_corpora()
        self.nlp = spacy.load("en_core_web_sm")
Beispiel #4
0
    def __init__(self, clf, extension='score'):
        """Keep a reference to the classifier and make sure the target
        Doc extension exists.

        :type clf: Classifier, needs to have a predict(X) function
        :param extension: name of the Doc attribute the score is stored under
        """
        self.clf = clf
        self.extension = extension
        if Doc.has_extension(extension):
            return
        # -1 marks a document that has not been scored yet.
        Doc.set_extension(extension, default=-1)
Beispiel #5
0
    def __init__(self):
        """Register the TAALED Doc extensions and load the bundled word
        lists into memory."""
        if not Doc.has_extension('taaled_lemmas'):
            Doc.set_extension('taaled_lemmas', default=[])

        if not Doc.has_extension('context_tokens'):
            Doc.set_extension('context_tokens', default=[])

        if not Doc.has_extension('function_tokens'):
            Doc.set_extension('function_tokens', default=[])

        # Load TAALED word list files
        # source: https://github.com/kristopherkyle/TAALED/tree/master/TAALED_1_3_1_Py3/dep_files
        module_path = os.path.abspath(os.path.dirname(__file__))
        adj_lem_list_path = os.path.join(module_path,
                                         "Corpora/adj_lem_list.txt")
        real_words_path = os.path.join(module_path, "Corpora/real_words.txt")

        self.adj_word_list = self._read_word_list(adj_lem_list_path)
        self.real_word_list = self._read_word_list(real_words_path)

    @staticmethod
    def _read_word_list(path):
        """Return the newline-separated entries of *path*, dropping the
        trailing empty string produced by a final newline."""
        # Context manager closes the handle; the original used bare
        # open(...).read(), which leaked the file object.
        with open(path, "r", errors='ignore') as fh:
            return fh.read().split("\n")[:-1]
Beispiel #6
0
    def test_docs_to_sents_df(self):
        """Read the eHOST test corpus and check sentence-DataFrame shapes
        with and without doc-name tracking and with a sentence window."""
        # Start from a clean slate: drop a stale 'concepts' extension if a
        # previous test registered it.
        if Doc.has_extension("concepts"):
            Doc.remove_extension("concepts")
        reader = EhostDirReader(nlp=self.nlp, support_overlap=False,
                                recursive=True,
                                schema_file='data/ehost_test_corpus/config/projectschema.xml')

        docs = reader.read(txt_dir='data/ehost_test_corpus/')

        tracked_df = Vectorizer.docs_to_sents_df(docs, type_filter=set(),
                                                 track_doc_name=True)
        print(tracked_df)
        assert (tracked_df.shape[0] == 12)

        plain_df = Vectorizer.docs_to_sents_df(docs, type_filter=set())
        print(plain_df)

        windowed_df = Vectorizer.docs_to_sents_df(docs, sent_window=2)
        assert (windowed_df.shape[0] == 20)
Beispiel #7
0
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
        """Record the extension names this matcher uses and register its
        Token, Span, and Doc extensions when absent."""
        # This matcher's own extension names come from class-level constants.
        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        # Extension names written by the upstream name-list matchers.
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        # Token-level annotation, defaulting to "no annotation".
        token_ext = self.token_extension_name
        if not Token.has_extension(token_ext):
            Token.set_extension(token_ext, default=self.ANOT_NONE)

        # Span-level flag computed on access via the getter.
        span_ext = self.span_extension_name
        if not Span.has_extension(span_ext):
            Span.set_extension(span_ext, getter=self.is_full_name_getter)

        # Doc-level list of results.
        doc_ext = self.doc_extension_name
        if not Doc.has_extension(doc_ext):
            Doc.set_extension(doc_ext, default=[])
Beispiel #8
0
 def init_component(self):
     """Register the Doc-level keyword extensions: a callable
     ``extract_keywords`` method and a ``kw_candidates`` cache slot."""
     # Candidate cache, None until keywords are computed.
     if not Doc.has_extension("kw_candidates"):
         Doc.set_extension("kw_candidates", default=None)
     # Exposes doc._.extract_keywords(...) bound to this component.
     if not Doc.has_extension("extract_keywords"):
         Doc.set_extension("extract_keywords", method=self.extract_keywords)
    def __init__(self):
        """Register the Doc extensions for the component's current and
        legacy outputs, each defaulting to an empty list."""
        for attr in (self.name, self.name + '_legacy'):
            if not Doc.has_extension(attr):
                Doc.set_extension(attr, default=[])
Beispiel #10
0
 def __init__(self):
     """Ensure the Doc-level "features" extension exists."""
     if Doc.has_extension("features"):
         return
     # NOTE(review): default=OrderedDict() is a single shared mutable
     # object across all Docs — presumably intentional here; confirm.
     Doc.set_extension("features", default=OrderedDict())