Example #1
 def test_empty(self):
     r = ReplaceText([])
     Z = r.transform([])
     self.assertEqual(len(Z), 0)
     X = ["Deadmau5 4x4 = 12"]
     r = ReplaceText([])
     Z = r.transform(X)
     self.assertEqual(list(Z), X)
Example #2
 def test_priority_is_accounted(self):
     X = ["What ' is ' what should n't be and what ' will be '"]
     Y = ["What  is  what should not be and what  will be "]
     r = ReplaceText([
         ("n't", "not"),
         ("'", ""),
     ])
     Z = r.transform(X)
     self.assertEqual(Z, Y)
Example #3
 def test_simple(self):
     X = ["Sentence number one number two and so on .",
          "Old ubuntu version is 12.04, but it's still mantained"]
     Y = ["Sentence number one number two and so on ",
          "Old ubuntu version is 1204, but it is still mantained"]
     r = ReplaceText([
         (".", ""),
         ("'s", " is"),
     ])
     Z = r.transform(X)
     self.assertEqual(Z, Y)
Example #4
 def test_fit_returns_self(self):
     r = ReplaceText([])
     s = r.fit([])
     self.assertEqual(s, r)
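Taken together, the four tests above pin down the ReplaceText contract: it is constructed from a list of (from, to) pairs, fit is a no-op that returns self, transform maps a list of strings to a list of strings, and earlier pairs take priority over later, overlapping ones. The following is a minimal sketch consistent with those tests; it is an illustrative assumption, not the project's actual implementation (which could, for instance, compile a single regex instead of chaining str.replace calls).

class ReplaceText:
    # Minimal sketch inferred from the tests above (assumed, not the
    # project's real code).
    def __init__(self, replacements):
        # (from, to) pairs; list order encodes priority, as
        # test_priority_is_accounted requires ("n't" -> "not" must run
        # before "'" -> "").
        self.replacements = replacements

    def fit(self, X, y=None):
        # Stateless: learn nothing and return self so the transformer
        # composes in a scikit-learn pipeline (see test_fit_returns_self).
        return self

    def transform(self, X):
        # Apply every replacement to each document, in list order.
        out = []
        for text in X:
            for old, new in self.replacements:
                text = text.replace(old, new)
            out.append(text)
        return out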
Example #5
    def __init__(self, classifier="sgd", classifier_args=None, lowercase=True,
                 text_replacements=None, map_to_synsets=False, binary=False,
                 min_df=0, ngram=1, stopwords=None, limit_train=None,
                 map_to_lex=False, duplicates=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: Whether or not all words are lowercased at the
              start of the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of english stopwords will be used.
            - `limit_train`: The maximum number of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (e.g. svc) that converge to an optimum with fewer
              samples.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
        """
        self.limit_train = limit_train
        self.duplicates = duplicates

        # Build pre-processing common to every extraction
        pipeline = [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        # Build feature extraction schemes
        ext = [build_text_extraction(binary=binary, min_df=min_df,
                                     ngram=ngram, stopwords=stopwords)]
        if map_to_synsets:
            ext.append(build_synset_extraction(binary=binary, min_df=min_df,
                                               ngram=ngram))
        if map_to_lex:
            ext.append(build_lex_extraction(binary=binary, min_df=min_df,
                                            ngram=ngram))
        ext = make_union(*ext)
        pipeline.append(ext)

        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {}
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
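For orientation, here is a hypothetical usage sketch of this constructor. The enclosing class name (PhraseSentimentPredictor below) and the fit/predict calls are assumptions for illustration; the excerpt shows only __init__.

# Hypothetical usage; PhraseSentimentPredictor, train_phrases and
# test_phrases are assumed names, not shown in the excerpt above.
predictor = PhraseSentimentPredictor(
    classifier="svc",
    classifier_args={"C": 0.1},
    text_replacements=[("n't", " not")],
    map_to_synsets=True,
    min_df=2,
    ngram=2,
    stopwords="english",
    limit_train=10000,  # cap training samples for the slow svc
)
predictor.fit(train_phrases)
predictions = predictor.predict(test_phrases)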
Example #6
    def __init__(self,
                 classifier="sgd",
                 classifier_args=None,
                 lowercase=True,
                 text_replacements=None,
                 map_to_synsets=True,
                 binary=False,
                 min_df=0,
                 ngram=1,
                 stopwords=None,
                 limit_train=None,
                 map_to_lex=True,
                 duplicates=True,
                 svm_features=False,
                 preprocessor=False,
                 useLemmatization=True,
                 stemming=False,
                 useStopWords=True,
                 word2vecFeatures=False,
                 splitModel=False,
                 useTfIdf=False):
        """
        Parameter description:
            - `classifier`: The type of classifier used as main classifier,
              valid values are "sgd", "knn", "svc", "randomforest".
            - `classifier_args`: A dict to be passed as arguments to the main
              classifier.
            - `lowercase`: Whether or not all words are lowercased at the
              start of the pipeline.
            - `text_replacements`: A list of tuples `(from, to)` specifying
              string replacements to be made at the start of the pipeline (after
              lowercasing).
            - `map_to_synsets`: Whether or not to use the Wordnet synsets
              feature set.
            - `binary`: Whether or not to count words in the bag-of-words
              representation as 0 or 1.
            - `min_df`: Minimum frequency a word needs to have to be included
              in the bag-of-words representation.
            - `ngram`: The maximum size of ngrams to be considered in the
              bag-of-words representation.
            - `stopwords`: A list of words to filter out of the bag-of-words
              representation. Can also be the string "english", in which case
              a default list of english stopwords will be used.
            - `limit_train`: The maximum number of training samples to give to
              the main classifier. This can be useful for some slow main
              classifiers (e.g. svc) that converge to an optimum with fewer
              samples.
            - `map_to_lex`: Whether or not to use the Harvard Inquirer lexicon
              features.
            - `duplicates`: Whether or not to check for identical phrases between
              train and prediction.
            - `svm_features`: Whether or not to include features from an SVM
              classifier.
            - `word2vecFeatures`: Whether or not to include word2vec-based
              features.
            - `preprocessor`: Whether to use the `Preprocessor` step instead
              of `ExtractText` at the start of the pipeline.
            - `useLemmatization`, `stemming`, `useStopWords`: Options
              forwarded to `Preprocessor` (lemmatize, stem, and remove stop
              words, respectively).
            - `useTfIdf`: Whether to use tf-idf weighting in the text and
              synset extraction steps.
            - `splitModel`: Flag stored on the instance for use elsewhere in
              the class.
        """
        self.limit_train = limit_train
        self.duplicates = duplicates
        print("Using tfidf: ", useTfIdf)
        # Build pre-processing common to every extraction
        pipeline = [
            Preprocessor(removeStopWords=useStopWords,
                         lemmatize=useLemmatization,
                         stem=stemming)
        ] if preprocessor else [ExtractText(lowercase)]
        if text_replacements:
            pipeline.append(ReplaceText(text_replacements))

        # Build feature extraction schemes
        ext = [
            build_text_extraction(binary=binary,
                                  min_df=min_df,
                                  ngram=ngram,
                                  stopwords=stopwords,
                                  useTfIdf=useTfIdf)
        ]
        if map_to_synsets:
            ext.append(
                build_synset_extraction(binary=binary,
                                        min_df=min_df,
                                        ngram=ngram,
                                        useTfIdf=useTfIdf))
        if map_to_lex:
            ext.append(
                build_lex_extraction(binary=binary, min_df=min_df,
                                     ngram=ngram))
        if svm_features:
            ext.append(build_svm_features())
        if word2vecFeatures:
            ext.append(build_word2vec_features())
        ext = make_union(*ext)
        pipeline.append(ext)

        # Build classifier and put everything together
        if classifier_args is None:
            classifier_args = {}
        if classifier == "ensemble":
            classifier_args = {
                "classifiers": [
                    SGDClassifier(),
                    RandomForestClassifier(),
                    SVC(),
                    KNeighborsClassifier(),
                    RandomForestClassifier(n_estimators=100,
                                           min_samples_leaf=10,
                                           n_jobs=-1)
                ]
            }
        # Classifier constructor, e.g. SGDClassifier(**classifier_args)
        classifier = _valid_classifiers[classifier](**classifier_args)
        self.pipeline = make_pipeline(*pipeline)
        self.classifier = classifier
        self.splitModel = splitModel
        self.splitSize = 1
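The same hypothetical usage sketch, adapted to the extended constructor's switches (class name again assumed). Note that classifier="ensemble" silently replaces any user-supplied classifier_args with the hard-coded sub-classifier list built above.

# Hypothetical usage of the extended constructor (class name assumed).
predictor = PhraseSentimentPredictor(
    classifier="ensemble",   # uses the hard-coded sub-classifier list
    preprocessor=True,       # Preprocessor replaces ExtractText
    useLemmatization=True,
    stemming=False,
    useStopWords=True,
    useTfIdf=True,           # tf-idf in text and synset extraction
    svm_features=True,
    word2vecFeatures=False,
)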