Ejemplo n.º 1
0
    def __init__(self):
        """Run the ``Corpus`` initialiser and create empty bookkeeping state.

        After the parent initialiser returns, the instance gets an empty
        ``scslabels`` list, three empty ``Counter`` tallies
        (``cslabel_count``, ``cstype_count``, ``lang_pair_count``) and an
        empty ``multilingual_sentences`` set.
        """
        Corpus.__init__(self)

        # Label storage and per-label / per-type tallies, all empty at start.
        self.scslabels = []
        self.cslabel_count, self.cstype_count = Counter(), Counter()

        # Tally keyed by language pair (nothing is counted here).
        self.lang_pair_count = Counter()

        # Collection of multilingual sentences; filled elsewhere.
        self.multilingual_sentences = set()
Ejemplo n.º 2
0
 def __init__(self, path, dirname, datatype):
     """Initialise the corpus for the file at ``join(dirname, path)``.

     Args:
         path: File name (or relative path) of the corpus file.
         dirname: Directory that ``path`` is joined onto.
         datatype: Forwarded to ``Corpus.__init__`` as the ``datatype``
             keyword argument.

     After the parent initialiser returns, ``self.datatype`` is
     (re)assigned from the extension of ``self.path``: ``.p`` gives
     ``'tokens'``, ``.xml`` gives ``'parse'`` and anything else gives
     ``'plaintext'``.
     """
     from os.path import join

     self.path = join(dirname, path)
     Corpus.__init__(self, self.path, print_info=False, level='f',
                     datatype=datatype)

     # The extension decides the final datatype, whatever was passed in.
     if self.path.endswith('.p'):
         self.datatype = 'tokens'
     elif self.path.endswith('.xml'):
         self.datatype = 'parse'
     else:
         self.datatype = 'plaintext'
Ejemplo n.º 3
0
 def __init__(self, path, dirname):
     """Initialise the corpus for the file at ``join(dirname, path)``.

     Args:
         path: File name (or relative path) of the corpus file.
         dirname: Directory that ``path`` is joined onto.

     After the parent initialiser returns, ``self.datatype`` is assigned
     from the extension of ``self.path``: ``.p`` gives ``'tokens'``,
     ``.xml`` gives ``'parse'`` and anything else gives ``'plaintext'``.
     """
     from os.path import join

     self.path = join(dirname, path)
     Corpus.__init__(self, self.path, print_info=False, level='f')

     # The file extension decides the datatype.
     if self.path.endswith('.p'):
         self.datatype = 'tokens'
     elif self.path.endswith('.xml'):
         self.datatype = 'parse'
     else:
         self.datatype = 'plaintext'
Ejemplo n.º 4
0
    def __init__(self, path, dirname, datatype):
        """Initialise the corpus for the file at ``join(dirname, path)``.

        Args:
            path: File name (or relative path) of the corpus file.
            dirname: Directory that ``path`` is joined onto.
            datatype: Forwarded to ``Corpus.__init__`` as the ``datatype``
                keyword argument.

        After the parent initialiser returns, ``self.datatype`` is
        (re)assigned from the extension of ``self.path``: ``.p`` gives
        ``"tokens"``, ``.xml`` gives ``"parse"`` and anything else gives
        ``"plaintext"``.
        """
        from os.path import join

        self.path = join(dirname, path)
        Corpus.__init__(
            self, self.path, print_info=False, level="f", datatype=datatype
        )

        # The extension decides the final datatype, whatever was passed in.
        if self.path.endswith(".p"):
            self.datatype = "tokens"
        elif self.path.endswith(".xml"):
            self.datatype = "parse"
        else:
            self.datatype = "plaintext"
Ejemplo n.º 5
0
    def __init__(self, char_dictionary=(None, None), label_dictionary=(None, None)):
        """Initialise the corpus with a fixed label vocabulary.

        Keyword arguments:
        char_dictionary -- A tuple of dictionaries for characters to indices
                           and indices to characters
        label_dictionary -- A tuple of dictionaries for labels to indices
                           and indices to labels

        NOTE(review): neither argument is used by this initialiser. The
        label dictionaries built below are always the ones handed to
        ``Corpus.__init__``, and ``char_dictionary`` is not forwarded.
        Confirm that this is intended.
        """
        label2idx = {
            '<PAD>': 0, 'lang1': 1, 'lang2': 2, 'other': 3, 'ne': 4,
            'ambiguous': 5, 'fw': 6, 'mixed': 7, 'unk': 8,
        }
        # Invert the local mapping. Reading ``self.label2idx`` here relied on
        # an attribute that has not been assigned at this point.
        idx2label = {idx: label for label, idx in label2idx.items()}

        Corpus.__init__(self, label_dictionary=(label2idx, idx2label))
Ejemplo n.º 6
0
    def __init__(self, articles):
        """Build per-article feature dicts and split them into train/test.

        Args:
            articles: Iterable of article objects. Each must expose a truthy
                or falsy ``train`` attribute and be accepted by
                ``get_features``.

        For every article the result of ``get_features(article)`` is
        appended to ``self.train_feats`` or ``self.test_feats`` and the
        article itself to ``self.train_articles`` or ``self.test_articles``,
        depending on ``article.train``. ``self.feat_names`` is taken from
        the keys of the last article's features, and is empty when
        ``articles`` is empty.
        """
        Corpus.__init__(self, articles)

        # Fallback so an empty ``articles`` gives empty feature names
        # instead of an UnboundLocalError after the loop.
        features = {}
        for article in articles:
            # Feature values for the current article.
            features = get_features(article)
            if article.train:
                self.train_feats.append(features)
                self.train_articles.append(article)
            else:
                self.test_feats.append(features)
                self.test_articles.append(article)

        self.feat_names = features.keys()
Ejemplo n.º 7
0
 def __init__(self, dictionary=None, corpus=None, index_file=None, max_docs=None, **kwargs):
     """Initialise the corpus and prepare its ``AnnoyIndex``.

     Args:
         dictionary: Forwarded to ``Corpus.__init__``.
         corpus: Forwarded to ``Corpus.__init__``.
         index_file: When truthy, the index is loaded from this file
             instead of being built from the corpus.
         max_docs: Passed to ``self.clip_corpus``.
         **kwargs: Accepted but not used here.

     The time spent building or loading the index is stored in
     ``self.train_time``.
     """
     Corpus.__init__(self, dictionary=dictionary, corpus=corpus)
     self.clip_corpus(max_docs)

     # Set up for KNN: the index width is the dictionary size.
     n_features = len(self.dictionary)
     self.index = AnnoyIndex(n_features)

     started = datetime.datetime.now()
     if index_file:
         self.index.load(index_file)
     else:
         self.transform_corpus(models.TfidfModel)
         for item_id, sparse_vec in enumerate(self):
             # Each vector is expanded to full width before insertion.
             dense = sparse2full(sparse_vec, n_features).astype(float)
             self.index.add_item(item_id, list(dense))
         self.index.build(self.no_trees)
     self.train_time = datetime.datetime.now() - started
Ejemplo n.º 8
0
    def __init__(self, articles, taglist, kind="wordlist"):
        """Build bag features for each article and split them into train/test.

        Args:
            articles: Iterable of article objects exposing ``train`` plus
                ``wrd_fql`` (for ``kind='wordlist'``) or ``pos_fql`` (for
                ``kind='poslist'``). It is iterated more than once.
            taglist: Passed as the second argument to ``get_bag_feats``.
            kind: ``'wordlist'`` uses ``article.wrd_fql``; ``'poslist'``
                uses ``remove_ditto(article.pos_fql)``.

        Raises:
            ValueError: If ``kind`` is neither ``'wordlist'`` nor
                ``'poslist'``.

        ``self.feat_names`` is taken from the keys of the last article's
        features, and is empty when ``articles`` is empty.
        """
        # Reject unsupported kinds up front; before, this only surfaced as
        # an UnboundLocalError on ``features`` further down.
        if kind not in ('wordlist', 'poslist'):
            raise ValueError(
                "kind must be 'wordlist' or 'poslist', got %r" % (kind,))

        Corpus.__init__(self, articles)

        # Keep a list of all articles in the training and testing sets.
        for article in articles:
            if article.train:
                self.train_articles.append(article)
            else:
                self.test_articles.append(article)

        # Fallback so an empty ``articles`` gives empty feature names.
        features = {}
        for article in articles:
            if kind == 'wordlist':
                source = article.wrd_fql
            else:
                source = remove_ditto(article.pos_fql)
            features = get_bag_feats(source, taglist)
            # Put the feature dict in either training or testing.
            if article.train:
                self.train_feats.append(features)
            else:
                self.test_feats.append(features)

        self.feat_names = features.keys()
Ejemplo n.º 9
0
    def __init__(self, articles, kind='bow'):
        """Build bag-based features for each article, split into train/test.

        Args:
            articles: Iterable of article objects exposing ``train``,
                ``wrd_fql``, ``pos_fql`` and ``sem_fql``. It is iterated
                more than once.
            kind: Passed through unchanged to ``get_bag_of_x``.

        The three bags are built with ``make_bag`` from the training
        articles only, then used to compute features for every article.
        ``self.feat_names`` is taken from the keys of the last article's
        features, and is empty when ``articles`` is empty.
        """
        Corpus.__init__(self, articles)

        # Keep a list of all articles in the training and testing sets.
        for article in articles:
            if article.train:
                self.train_articles.append(article)
            else:
                self.test_articles.append(article)

        # Bags come from training articles only.
        wrd_bag = make_bag([a.wrd_fql for a in self.train_articles])
        pos_bag = make_bag([a.pos_fql for a in self.train_articles])
        sem_bag = make_bag([a.sem_fql for a in self.train_articles])

        # Fallback so an empty ``articles`` gives empty feature names
        # instead of an UnboundLocalError after the loop.
        features = {}
        for article in articles:
            features = get_bag_of_x(article, wrd_bag, pos_bag, sem_bag, kind)
            # Put the feature dict in either training or testing.
            if article.train:
                self.train_feats.append(features)
            else:
                self.test_feats.append(features)

        self.feat_names = features.keys()
Ejemplo n.º 10
0
 def __init__(self, path):
     """Store ``path`` and run the ``Corpus`` initialiser on it.

     Args:
         path: Location handed to ``Corpus.__init__``; also kept as
             ``self.path``.

     The parent is called with ``print_info=False`` and ``level='s'``.
     """
     self.path = path
     Corpus.__init__(self, self.path, print_info=False, level='s')
Ejemplo n.º 11
0
 def __init__(self, path, datatype):
     """Store ``path`` and run the ``Corpus`` initialiser on it.

     Args:
         path: Location handed to ``Corpus.__init__``; also kept as
             ``self.path``.
         datatype: Forwarded to ``Corpus.__init__`` as ``datatype``.

     The parent is called with ``print_info=False`` and ``level='s'``.
     """
     self.path = path
     Corpus.__init__(self, self.path, print_info=False, level='s',
                     datatype=datatype)
Ejemplo n.º 12
0
 def __init__(self, path):
     """Initialise the corpus at ``path`` and load its truth file.

     Args:
         path: Directory handed to ``Corpus.__init__``; also kept as
             ``self.path``.

     ``self.truth_dict`` is set to the result of
     ``read_classification_from_file`` for ``<path>/!truth.txt``.
     """
     Corpus.__init__(self, path)
     self.path = path
     truth_file = self.path + "/!truth.txt"
     self.truth_dict = read_classification_from_file(truth_file)
Ejemplo n.º 13
0
 def __init__(self, path_to_mails):
     """Delegate directly to ``Corpus.__init__``.

     Args:
         path_to_mails: Passed unchanged as the single argument to the
             parent initialiser.
     """
     Corpus.__init__(self, path_to_mails)
 def __init__(self,src_db,dst_db,dictfile=None,nltk_data_path=None):
     """Remember the source database and initialise the ``Corpus`` part.

     Args:
         src_db: Stored as ``self.src_db`` before the parent is called.
         dst_db: First positional argument to ``Corpus.__init__``.
         dictfile: Second positional argument to ``Corpus.__init__``.
         nltk_data_path: Third positional argument to ``Corpus.__init__``.
     """
     self.src_db = src_db
     Corpus.__init__(
         self,
         dst_db,
         dictfile,
         nltk_data_path,
     )
Ejemplo n.º 15
0
 def __init__(self, path, datatype):
     """Keep ``path`` on the instance, then initialise the ``Corpus`` base.

     Args:
         path: Stored as ``self.path`` and passed to ``Corpus.__init__``.
         datatype: Passed to ``Corpus.__init__`` under the ``datatype``
             keyword.

     ``print_info=False`` and ``level='s'`` are always supplied to the
     parent initialiser.
     """
     self.path = path
     Corpus.__init__(
         self,
         self.path,
         print_info=False,
         level='s',
         datatype=datatype,
     )
Ejemplo n.º 16
0
 def __init__(self,src_file,dst_db,dictfile=None):
     """Remember the source file and initialise the ``Corpus`` part.

     Args:
         src_file: Stored as ``self.src_file`` before the parent is called.
         dst_db: First positional argument to ``Corpus.__init__``.
         dictfile: Second positional argument to ``Corpus.__init__``.
     """
     self.src_file = src_file
     Corpus.__init__(
         self,
         dst_db,
         dictfile,
     )
Ejemplo n.º 17
0
 def __init__(self, path, datatype):
     """Record ``path`` and hand it to the ``Corpus`` initialiser.

     Args:
         path: Saved on the instance as ``self.path`` and given to
             ``Corpus.__init__``.
         datatype: Given to ``Corpus.__init__`` as ``datatype``.

     The parent call always includes ``print_info=False`` and
     ``level="s"``.
     """
     self.path = path
     Corpus.__init__(
         self, self.path, print_info=False, level="s", datatype=datatype
     )
Ejemplo n.º 18
0
 def __init__(self, path, preprocess=None, max_len=None):
     """Initialise the ``'quora'`` corpus at ``path`` and load it.

     Args:
         path: Passed to ``Corpus.__init__`` after the ``'quora'`` name.
         preprocess: Forwarded to ``Corpus.__init__`` as ``preprocess``.
         max_len: Forwarded to ``Corpus.__init__`` as ``max_len``.

     ``self.load()`` is called once the parent initialiser returns.
     """
     # Forward the caller's values; passing literal ``None`` here silently
     # discarded both arguments.
     Corpus.__init__(self, 'quora', path, preprocess=preprocess,
                     max_len=max_len)
     self.load()
Ejemplo n.º 19
0
 def __init__(self, path, preprocess=None, max_len=None):
     """Initialise the ``'microsoft'`` corpus at ``path`` and load it.

     Args:
         path: Passed to ``Corpus.__init__`` after the ``'microsoft'`` name.
         preprocess: Forwarded to ``Corpus.__init__`` as ``preprocess``.
         max_len: Forwarded to ``Corpus.__init__`` as ``max_len``.

     ``self.load()`` is called once the parent initialiser returns.
     """
     # Forward the caller's values; passing literal ``None`` here silently
     # discarded both arguments.
     Corpus.__init__(self, 'microsoft', path, preprocess=preprocess,
                     max_len=max_len)
     self.load()
 def __init__(self, src_db, dst_db, dictfile=None, nltk_data_path=None):
     """Keep a reference to the source database, then set up ``Corpus``.

     Args:
         src_db: Saved on the instance as ``self.src_db``.
         dst_db: Given to ``Corpus.__init__`` as its first argument.
         dictfile: Given to ``Corpus.__init__`` as its second argument.
         nltk_data_path: Given to ``Corpus.__init__`` as its third
             argument.
     """
     self.src_db = src_db
     Corpus.__init__(
         self,
         dst_db,
         dictfile,
         nltk_data_path,
     )