def test_empty(self):
    # Transforming an empty corpus yields an empty result.
    r = ReplaceText([])
    Z = r.transform([])
    self.assertEqual(len(Z), 0)

    # With no replacement pairs, input passes through unchanged.
    X = ["Deadmau5 4x4 = 12"]
    r = ReplaceText([])
    Z = r.transform(X)
    self.assertEqual(list(Z), X)
def test_priority_is_accounted(self):
    X = ["What ' is ' what should n't be and what ' will be '"]
    Y = ["What is what should not be and what will be "]
    # ("n't", "not") is listed first, so it must be applied before
    # ("'", "") gets a chance to strip the apostrophe out of "n't".
    r = ReplaceText([
        ("n't", "not"),
        ("'", ""),
    ])
    Z = r.transform(X)
    self.assertEqual(Z, Y)
def test_simple(self): X = ["Sentence number one number two and so on .", "Old ubuntu version is 12.04, but it's still mantained"] Y = ["Sentence number one number two and so on ", "Old ubuntu version is 1204, but it is still mantained"] r = ReplaceText([ (".", ""), ("'s", " is"), ]) Z = r.transform(X) self.assertEqual(Z, Y)
def test_simple(self): X = [ "Sentence number one number two and so on .", "Old ubuntu version is 12.04, but it's still mantained" ] Y = [ "Sentence number one number two and so on ", "Old ubuntu version is 1204, but it is still mantained" ] r = ReplaceText([ (".", ""), ("'s", " is"), ]) Z = r.transform(X) self.assertEqual(Z, Y)
def test_fit_returns_self(self):
    r = ReplaceText([])
    s = r.fit([])
    self.assertEqual(s, r)
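# The tests above exercise a ReplaceText transformer whose implementation
# is not part of this excerpt. Below is a minimal sketch that satisfies
# them, assuming scikit-learn's stateless fit/transform convention and
# sequential str.replace semantics; the project's real implementation may
# differ in details such as whitespace handling around removed tokens.
class ReplaceText:
    def __init__(self, replacements):
        # (from, to) pairs; earlier pairs take priority because they are
        # applied first (e.g. "n't" -> "not" runs before "'" -> "").
        self.replacements = list(replacements)

    def fit(self, X, y=None):
        # Stateless: nothing to learn, return self so the transformer
        # composes inside a scikit-learn pipeline.
        return self

    def transform(self, X):
        out = []
        for text in X:
            for src, dst in self.replacements:
                text = text.replace(src, dst)
            out.append(text)
        return out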
def __init__(self, classifier="sgd", classifier_args=None, lowercase=True, text_replacements=None, map_to_synsets=False, binary=False, min_df=0, ngram=1, stopwords=None, limit_train=None, map_to_lex=False, duplicates=False): """ Parameter description: - `classifier`: The type of classifier used as main classifier, valid values are "sgd", "knn", "svc", "randomforest". - `classifier_args`: A dict to be passed as arguments to the main classifier. - `lowercase`: wheter or not all words are lowercased at the start of the pipeline. - `text_replacements`: A list of tuples `(from, to)` specifying string replacements to be made at the start of the pipeline (after lowercasing). - `map_to_synsets`: Whether or not to use the Wordnet synsets feature set. - `binary`: Whether or not to count words in the bag-of-words representation as 0 or 1. - `min_df`: Minumim frequency a word needs to have to be included in the bag-of-word representation. - `ngram`: The maximum size of ngrams to be considered in the bag-of-words representation. - `stopwords`: A list of words to filter out of the bag-of-words representation. Can also be the string "english", in which case a default list of english stopwords will be used. - `limit_train`: The maximum amount of training samples to give to the main classifier. This can be useful for some slow main classifiers (ex: svc) that converge with less samples to an optimum. - `max_to_lex`: Whether or not to use the Harvard Inquirer lexicon features. - `duplicates`: Whether or not to check for identical phrases between train and prediction. """ self.limit_train = limit_train self.duplicates = duplicates # Build pre-processing common to every extraction pipeline = [ExtractText(lowercase)] if text_replacements: pipeline.append(ReplaceText(text_replacements)) # Build feature extraction schemes ext = [build_text_extraction(binary=binary, min_df=min_df, ngram=ngram, stopwords=stopwords)] if map_to_synsets: ext.append(build_synset_extraction(binary=binary, min_df=min_df, ngram=ngram)) if map_to_lex: ext.append(build_lex_extraction(binary=binary, min_df=min_df, ngram=ngram)) ext = make_union(*ext) pipeline.append(ext) # Build classifier and put everything togheter if classifier_args is None: classifier_args = {} classifier = _valid_classifiers[classifier](**classifier_args) self.pipeline = make_pipeline(*pipeline) self.classifier = classifier
def __init__(self, classifier="sgd", classifier_args=None, lowercase=True, text_replacements=None, map_to_synsets=True, binary=False, min_df=0, ngram=1, stopwords=None, limit_train=None, map_to_lex=True, duplicates=True, svm_features=False, preprocessor=False, useLemmatization=True, stemming=False, useStopWords=True, word2vecFeatures=False, splitModel=False, useTfIdf=False): """ Parameter description: - `classifier`: The type of classifier used as main classifier, valid values are "sgd", "knn", "svc", "randomforest". - `classifier_args`: A dict to be passed as arguments to the main classifier. - `lowercase`: wheter or not all words are lowercased at the start of the pipeline. - `text_replacements`: A list of tuples `(from, to)` specifying string replacements to be made at the start of the pipeline (after lowercasing). - `map_to_synsets`: Whether or not to use the Wordnet synsets feature set. - `binary`: Whether or not to count words in the bag-of-words representation as 0 or 1. - `min_df`: Minumim frequency a word needs to have to be included in the bag-of-word representation. - `ngram`: The maximum size of ngrams to be considered in the bag-of-words representation. - `stopwords`: A list of words to filter out of the bag-of-words representation. Can also be the string "english", in which case a default list of english stopwords will be used. - `limit_train`: The maximum amount of training samples to give to the main classifier. This can be useful for some slow main classifiers (ex: svc) that converge with less samples to an optimum. - `max_to_lex`: Whether or not to use the Harvard Inquirer lexicon features. - `duplicates`: Whether or not to check for identical phrases between train and prediction. - `svm_features`: Whether or not to include features from an SVM classifier """ self.limit_train = limit_train self.duplicates = duplicates print("Using tfidf: ", useTfIdf) # Build pre-processing common to every extraction pipeline = [ Preprocessor(removeStopWords=useStopWords, lemmatize=useLemmatization, stem=stemming) ] if preprocessor else [ExtractText(lowercase)] if text_replacements: pipeline.append(ReplaceText(text_replacements)) # Build feature extraction schemes ext = [ build_text_extraction(binary=binary, min_df=min_df, ngram=ngram, stopwords=stopwords, useTfIdf=useTfIdf) ] if map_to_synsets: ext.append( build_synset_extraction(binary=binary, min_df=min_df, ngram=ngram, useTfIdf=useTfIdf)) if map_to_lex: ext.append( build_lex_extraction(binary=binary, min_df=min_df, ngram=ngram)) if svm_features: ext.append(build_svm_features()) if word2vecFeatures: ext.append(build_word2vec_features()) ext = make_union(*ext) pipeline.append(ext) # Build classifier and put everything togheter if classifier_args is None: classifier_args = {} if classifier == "ensemble": classifier_args = { "classifiers": [ SGDClassifier(), RandomForestClassifier(), SVC(), KNeighborsClassifier(), RandomForestClassifier(n_estimators=100, min_samples_leaf=10, n_jobs=-1) ] } #Classifier constructor E.g. SGDClassifier(args) classifier = _valid_classifiers[classifier](**classifier_args) self.pipeline = make_pipeline(*pipeline) self.classifier = classifier self.splitModel = splitModel self.splitSize = 1