def featureExtractor(spoken, target, words, sentences, return_feats=False):
    '''Build the "other_langs(<prefix>)" indicator features and fit a
    DictVectorizer on them.

    Every prefix yielded by makePrefixLangMapping() gets a feature
    initialised to 0; every prefix returned by
    scanForMultipleLanguages(target, words) is then set to 1.

    Parameters:
        spoken       -- not used by this function.
        target       -- forwarded to scanForMultipleLanguages().
        words        -- forwarded to scanForMultipleLanguages().
        sentences    -- not used by this function.
        return_feats -- when true, also return the raw feature dict.

    Returns:
        The fitted DictVectorizer, or the tuple (feats, vectorizer) when
        return_feats is true.
    '''
    prefmap = makePrefixLangMapping()
    prefixes = scanForMultipleLanguages(target, words)

    # Start with every known prefix switched off ...
    feats = {"other_langs({0})".format(pref): 0 for pref in prefmap}
    # ... then switch on the detected ones. The key must be spelled exactly
    # like the one above (including the closing parenthesis), otherwise the
    # flag ends up under a separate, stray feature name.
    for prefix in prefixes:
        feats["other_langs({0})".format(prefix)] = 1

    vec = DictVectorizer()
    # Only the fitted vectorizer is needed; the transformed matrix itself
    # is not used by this function.
    vec.fit_transform(feats)

    if return_feats:
        return feats, vec
    return vec
def returnEntryVersusTarget(self, datalist):
    '''Some users write in a language that is different from their target
    language (i.e. if they are practicing a language that they didn't
    specify that they were learning, or if they are writing an entry in
    their native language asking someone to translate something for them).
    This function counts how many of these instances exist in the
    specified dataset.

    Each item of datalist is indexed with self.ENTRY (the entry text) and
    self.STUDYING (a whitespace-separated string of studied languages).
    The detected language of the entry is translated through
    makePrefixLangMapping() and compared against the studied languages.

    Entries whose detected language is not in the mapping, and entries
    with no studied language listed, are not counted.

    Prints the elapsed time and the resulting count; returns None.
    '''
    t0 = time()
    prefmap = makePrefixLangMapping()
    not_orig_lang = 0
    for data in datalist:
        entrylang = TextBlob(data[self.ENTRY]).detect_language()
        # A detected language we have no mapping for cannot be compared
        # with the studied languages, so the entry is skipped.
        if entrylang not in prefmap:
            continue
        studying = data[self.STUDYING].split()
        # Count the entry at most once, and only when none of the studied
        # languages matches the detected one. Counting once per
        # non-matching studied language would flag entries that are
        # actually written in one of the user's target languages.
        if studying and prefmap[entrylang] not in studying:
            not_orig_lang += 1
    print("Took %s seconds" % (time() - t0))
    print("Of %s entries, there are %s entries written in a different language than specified" % (len(datalist), not_orig_lang))
def returnEntryVersusTarget(self, datalist):
    '''Some users write in a language that is different from their target
    language (i.e. if they are practicing a language that they didn't
    specify that they were learning, or if they are writing an entry in
    their native language asking someone to translate something for them).
    This function counts how many of these instances exist in the
    specified dataset.

    Each item of datalist is indexed with self.ENTRY (the entry text) and
    self.STUDYING (a whitespace-separated string of studied languages).
    The detected language of the entry is translated through
    makePrefixLangMapping() and compared against the studied languages.

    Entries whose detected language is not in the mapping, and entries
    with no studied language listed, are not counted.

    Prints the elapsed time and the resulting count; returns None.
    '''
    t0 = time()
    prefmap = makePrefixLangMapping()
    not_orig_lang = 0
    for data in datalist:
        entrylang = TextBlob(data[self.ENTRY]).detect_language()
        # A detected language we have no mapping for cannot be compared
        # with the studied languages, so the entry is skipped.
        if entrylang not in prefmap:
            continue
        studying = data[self.STUDYING].split()
        # Count the entry at most once, and only when none of the studied
        # languages matches the detected one. Counting once per
        # non-matching studied language would flag entries that are
        # actually written in one of the user's target languages.
        if studying and prefmap[entrylang] not in studying:
            not_orig_lang += 1
    print("Took %s seconds" % (time() - t0))
    print(
        "Of %s entries, there are %s entries written in a different language than specified"
        % (len(datalist), not_orig_lang))