def __init__(self):
    """Create an empty corpus with code-switching bookkeeping structures."""
    Corpus.__init__(self)
    self.scslabels = []
    # One frequency counter per statistic tracked for code-switching.
    self.cslabel_count, self.cstype_count, self.lang_pair_count = (
        Counter(), Counter(), Counter())
    self.multilingual_sentences = set()
def __init__(self, path, dirname, datatype):
    """Wrap a corpus file at dirname/path at file level ('f').

    The stored datatype is derived from the file extension:
    '.p' -> 'tokens', '.xml' -> 'parse', anything else -> 'plaintext'.
    Fix: dropped the unused local imports (os, isfile, isdir).
    """
    from os.path import join
    self.path = join(dirname, path)
    kwargs = {'print_info': False, 'level': 'f', 'datatype': datatype}
    Corpus.__init__(self, self.path, **kwargs)
    # NOTE(review): the caller's datatype is passed to Corpus.__init__, but
    # self.datatype is then overwritten from the extension — confirm intended.
    if self.path.endswith('.p'):
        self.datatype = 'tokens'
    elif self.path.endswith('.xml'):
        self.datatype = 'parse'
    else:
        self.datatype = 'plaintext'
def __init__(self, path, dirname):
    """Wrap a corpus file at dirname/path at file level ('f').

    The stored datatype is derived from the file extension:
    '.p' -> 'tokens', '.xml' -> 'parse', anything else -> 'plaintext'.
    Fix: dropped the unused local imports (os, isfile, isdir).
    """
    from os.path import join
    self.path = join(dirname, path)
    kwargs = {'print_info': False, 'level': 'f'}
    Corpus.__init__(self, self.path, **kwargs)
    if self.path.endswith('.p'):
        self.datatype = 'tokens'
    elif self.path.endswith('.xml'):
        self.datatype = 'parse'
    else:
        self.datatype = 'plaintext'
def __init__(self, path, dirname, datatype):
    """Wrap a corpus file at dirname/path at file level ("f").

    The stored datatype is derived from the file extension:
    ".p" -> "tokens", ".xml" -> "parse", anything else -> "plaintext".
    Fix: dropped the unused local imports (os, isfile, isdir).
    """
    from os.path import join
    self.path = join(dirname, path)
    kwargs = {"print_info": False, "level": "f", "datatype": datatype}
    Corpus.__init__(self, self.path, **kwargs)
    # NOTE(review): the caller's datatype is passed to Corpus.__init__, but
    # self.datatype is then overwritten from the extension — confirm intended.
    if self.path.endswith(".p"):
        self.datatype = "tokens"
    elif self.path.endswith(".xml"):
        self.datatype = "parse"
    else:
        self.datatype = "plaintext"
def __init__(self, char_dictionary=(None, None), label_dictionary=(None, None)):
    """Reads in a corpus file and sets the corpus variables.

    Keyword arguments:
    char_dictionary -- A tuple of dictionaries for characters to indices
        and indices to characters
    label_dictionary -- A tuple of dictionaries for labels to indices
        and indices to labels
    """
    label2idx = {'<PAD>': 0, 'lang1': 1, 'lang2': 2, 'other': 3,
                 'ne': 4, 'ambiguous': 5, 'fw': 6, 'mixed': 7, 'unk': 8}
    # Bug fix: invert the *local* mapping. The old code read
    # self.label2idx, which does not exist until Corpus.__init__ runs,
    # so it raised AttributeError.
    idx2label = {i: l for l, i in label2idx.items()}
    # NOTE(review): char_dictionary is accepted but never forwarded to
    # Corpus.__init__ — confirm whether that is intentional.
    Corpus.__init__(self, label_dictionary=(label2idx, idx2label))
def __init__(self, articles):
    """Split articles into train/test pools and extract per-article features."""
    Corpus.__init__(self, articles)
    features = {}  # fix: defined even when `articles` is empty (was a NameError)
    for article in articles:
        # Get the feature values for the current article.
        features = get_features(article)
        if article.train:
            # Put the feature dict in the training pool and remember the article.
            self.train_feats.append(features)
            self.train_articles.append(article)
        else:
            self.test_feats.append(features)
            self.test_articles.append(article)
    # Taken from the last article's dict (assumes all articles share the
    # same feature keys — TODO confirm).
    self.feat_names = features.keys()
def __init__(self, dictionary=None, corpus=None, index_file=None, max_docs=None, **kwargs):
    """Build (or load from `index_file`) an Annoy KNN index over the corpus.

    Keyword arguments:
    dictionary -- gensim-style dictionary forwarded to Corpus.__init__
    corpus -- the underlying corpus forwarded to Corpus.__init__
    index_file -- if given, load a pre-built Annoy index instead of building
    max_docs -- cap on the number of documents kept (via clip_corpus)
    Fixes: dropped the redundant bare `return` at the end of __init__ and
    the single-use end_time local.
    """
    Corpus.__init__(self, dictionary=dictionary, corpus=corpus)
    self.clip_corpus(max_docs)
    # Set up for KNN: one Annoy dimension per dictionary entry.
    features = len(self.dictionary)
    self.index = AnnoyIndex(features)
    start_time = datetime.datetime.now()
    if index_file:
        self.index.load(index_file)
    else:
        # No prebuilt index: TF-IDF transform, then add every document vector.
        self.transform_corpus(models.TfidfModel)
        for i, vector in enumerate(self):
            self.index.add_item(
                i, list(sparse2full(vector, features).astype(float)))
        self.index.build(self.no_trees)
    # Record how long building/loading the index took.
    self.train_time = datetime.datetime.now() - start_time
def __init__(self, articles, taglist, kind="wordlist"):
    """Split articles into train/test pools and build bag features.

    Keyword arguments:
    taglist -- tags passed through to get_bag_feats
    kind -- 'wordlist' (word frequencies) or 'poslist' (POS frequencies
        with ditto tags removed)
    """
    Corpus.__init__(self, articles)
    for article in articles:
        if article.train:
            # Keep a list of all articles in the training set.
            self.train_articles.append(article)
        else:
            # Keep a list of all articles in the testing set.
            self.test_articles.append(article)
    features = {}
    for article in articles:
        if kind == 'wordlist':
            features = get_bag_feats(article.wrd_fql, taglist)
        elif kind == 'poslist':
            features = get_bag_feats(remove_ditto(article.pos_fql), taglist)
        else:
            # Fix: fail fast with a clear message; previously an unknown
            # kind surfaced as a NameError on `features` below.
            raise ValueError("unknown kind: %r" % (kind,))
        # Put the feature dict in either the testing or training pool.
        if article.train:
            self.train_feats.append(features)
        else:
            self.test_feats.append(features)
    self.feat_names = features.keys()
def __init__(self, articles, kind='bow'):
    """Split articles into train/test pools and build bag-of-x features."""
    Corpus.__init__(self, articles)
    for article in articles:
        if article.train:
            # Keep a list of all articles in the training set.
            self.train_articles.append(article)
        else:
            # Keep a list of all articles in the testing set.
            self.test_articles.append(article)
    # Vocabulary bags are built from the training split only.
    wrd_bag = make_bag([a.wrd_fql for a in self.train_articles])
    pos_bag = make_bag([a.pos_fql for a in self.train_articles])
    sem_bag = make_bag([a.sem_fql for a in self.train_articles])
    features = {}  # fix: defined even when `articles` is empty (was a NameError)
    for article in articles:
        features = get_bag_of_x(article, wrd_bag, pos_bag, sem_bag, kind)
        # Put the feature dict in either the testing or training pool.
        if article.train:
            self.train_feats.append(features)
        else:
            self.test_feats.append(features)
    self.feat_names = features.keys()
def __init__(self, path):
    """Wrap the corpus at *path* at sentence level ('s'), quietly."""
    self.path = path
    Corpus.__init__(self, self.path, print_info=False, level='s')
def __init__(self, path, datatype):
    """Wrap the corpus at *path* at sentence level ('s') with the given datatype."""
    self.path = path
    Corpus.__init__(self, self.path,
                    print_info=False, level='s', datatype=datatype)
def __init__(self, path):
    """Load the corpus at *path* together with its ground-truth labels.

    The labels are read from '<path>/!truth.txt'.
    """
    Corpus.__init__(self, path)
    self.path = path
    truth_file = self.path + "/!truth.txt"
    self.truth_dict = read_classification_from_file(truth_file)
def __init__(self, path_to_mails):
    # Thin wrapper: delegate all construction to the base Corpus.
    Corpus.__init__(self, path_to_mails)
def __init__(self,src_db,dst_db,dictfile=None,nltk_data_path=None):
    # Remember the source DB; the base Corpus is initialised on the
    # destination DB (dictfile / nltk_data_path are passed straight through).
    self.src_db = src_db
    Corpus.__init__(self,dst_db,dictfile,nltk_data_path)
def __init__(self,src_file,dst_db,dictfile=None):
    # Remember the source file; the base Corpus is initialised on the
    # destination DB (dictfile is passed straight through).
    self.src_file = src_file
    Corpus.__init__(self,dst_db,dictfile)
def __init__(self, path, datatype):
    """Wrap the corpus at *path* at sentence level ("s") with the given datatype."""
    self.path = path
    Corpus.__init__(self, self.path,
                    print_info=False, level="s", datatype=datatype)
def __init__(self, path, preprocess=None, max_len=None):
    """Load the Quora paraphrase corpus from *path*.

    Keyword arguments:
    preprocess -- optional preprocessing hook forwarded to Corpus
    max_len -- optional maximum sequence length forwarded to Corpus
    Bug fix: the caller's preprocess/max_len are now forwarded; the old
    code passed hard-coded None for both, silently ignoring them.
    """
    Corpus.__init__(self, 'quora', path,
                    preprocess=preprocess, max_len=max_len)
    self.load()
def __init__(self, path, preprocess=None, max_len=None):
    """Load the Microsoft paraphrase corpus from *path*.

    Keyword arguments:
    preprocess -- optional preprocessing hook forwarded to Corpus
    max_len -- optional maximum sequence length forwarded to Corpus
    Bug fix: the caller's preprocess/max_len are now forwarded; the old
    code passed hard-coded None for both, silently ignoring them.
    """
    Corpus.__init__(self, 'microsoft', path,
                    preprocess=preprocess, max_len=max_len)
    self.load()
def __init__(self, src_db, dst_db, dictfile=None, nltk_data_path=None):
    # Remember the source DB; the base Corpus is initialised on the
    # destination DB (dictfile / nltk_data_path are passed straight through).
    self.src_db = src_db
    Corpus.__init__(self, dst_db, dictfile, nltk_data_path)