def setCorpus(self, path, token_pattern=r"\S+", read_as=ReadAs.LINE_BY_LINE,
              options={"format": "text"}):
    """Set the ``corpus`` param to an ``ExternalResource`` built from the arguments.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        token_pattern: Value stored under the ``"tokenPattern"`` option,
            unless ``options`` already contains that key.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. The dict is copied before use, so neither
            the caller's dict nor the shared default is modified.

    Returns:
        The result of ``self._set``.
    """
    # The default pattern is a raw string: "\S" is not a valid escape
    # sequence in a regular literal and triggers a warning on modern Python.
    opts = options.copy()
    # An explicit "tokenPattern" in ``options`` takes precedence.
    opts.setdefault("tokenPattern", token_pattern)
    return self._set(corpus=ExternalResource(path, read_as, opts))
def setExternalRules(self, path, delimiter, read_as=ReadAs.LINE_BY_LINE,
                     options={"format": "text"}):
    """Set the ``externalRules`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        delimiter: Stored under the ``"delimiter"`` option when ``options``
            does not already define it.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options; a copy is used, the argument is untouched.

    Returns:
        The result of ``self._set``.
    """
    resource_options = options.copy()
    # A "delimiter" entry supplied through ``options`` wins over the argument.
    resource_options.setdefault("delimiter", delimiter)
    rules = ExternalResource(path, read_as, resource_options)
    return self._set(externalRules=rules)
def setDictionary(self, path, delimiter, read_as=ReadAs.LINE_BY_LINE,
                  options={'format': 'text'}):
    """Set the ``dictionary`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        delimiter: Used as the ``"delimiter"`` option unless ``options``
            already carries one.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options; copied before the delimiter is filled in.

    Returns:
        The result of ``self._set``.
    """
    dictionary_options = options.copy()
    # Only fill in the delimiter when the caller did not provide one.
    dictionary_options.setdefault("delimiter", delimiter)
    dictionary = ExternalResource(path, read_as, dictionary_options)
    return self._set(dictionary=dictionary)
def readDataset(self, path, read_as=ReadAs.LINE_BY_LINE, opts={}):
    """Read the dataset at ``path`` through the wrapped Java object.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        opts: Resource options. A copy is handed to ``ExternalResource`` so
            the shared default dict (or a caller-owned dict) cannot be
            changed through the resource. ``None`` is treated as no options.

    Returns:
        A ``DataFrame`` wrapping the object returned by
        ``self._java_obj.readDataset``.
    """
    # Never pass the default dict itself along: it is created once at
    # definition time and would otherwise be shared by every call.
    resource_opts = dict(opts) if opts else {}
    resource = ExternalResource(path, read_as, resource_opts)
    # ToDo Replace with std pyspark
    session = SparkSession(self.sc)
    jSession = session._jsparkSession
    jdf = self._java_obj.readDataset(resource, jSession)
    # NOTE(review): ``_wrapped`` is a private SparkSession attribute; confirm
    # it exists in the PySpark version this file targets.
    return DataFrame(jdf, session._wrapped)
def setCorpus(self, path, delimiter, read_as=ReadAs.SPARK_DATASET,
              options={"format": "text", "repartition": "8"}):
    """Set the ``corpus`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        delimiter: Always written to the ``"delimiter"`` option, replacing
            any value present in ``options``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options; copied before the delimiter is applied.

    Returns:
        The result of ``self._set``.
    """
    corpus_options = options.copy()
    # Unconditional: the argument overrides a "delimiter" key in ``options``.
    corpus_options.update({"delimiter": delimiter})
    corpus = ExternalResource(path, read_as, corpus_options)
    return self._set(corpus=corpus)
def __init__(self):
    """Initialise the approach and register its default param values.

    Passes the fully-qualified ``NorvigSweetingApproach`` class name to the
    parent initialiser, then registers defaults for ``dictionary``,
    ``caseSensitive``, ``doubleVariants`` and ``shortCircuit``.
    """
    qualified_name = "com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach"
    super(NorvigSweetingApproach, self).__init__(classname=qualified_name)

    # Default dictionary resource, built after the parent initialiser has run.
    default_dictionary = ExternalResource(
        "/spell/words.txt",
        ReadAs.LINE_BY_LINE,
        {"tokenPattern": "[a-zA-Z]+"}
    )
    self._setDefault(
        dictionary=default_dictionary,
        caseSensitive=False,
        doubleVariants=False,
        shortCircuit=False
    )
def __init__(self):
    """Initialise the approach and register its default param values.

    Passes the fully-qualified ``PerceptronApproach`` class name to the
    parent initialiser, then registers defaults for ``corpus`` and
    ``nIterations``.
    """
    qualified_name = "com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach"
    super(PerceptronApproach, self).__init__(classname=qualified_name)

    # Default corpus resource, built after the parent initialiser has run.
    corpus_options = {"delimiter": "|", "format": "text"}
    default_corpus = ExternalResource("/anc-pos-corpus/", ReadAs.LINE_BY_LINE, corpus_options)
    self._setDefault(corpus=default_corpus, nIterations=5)
def setDictionary(self, path, key_delimiter, value_delimiter,
                  read_as=ReadAs.LINE_BY_LINE, options={"format": "text"}):
    """Set the ``dictionary`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        key_delimiter: Used as the ``"keyDelimiter"`` option unless
            ``options`` already defines it.
        value_delimiter: Used as the ``"valueDelimiter"`` option unless
            ``options`` already defines it.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options; copied before the delimiters are filled in.

    Returns:
        The result of ``self._set``.
    """
    resource_options = options.copy()
    fallbacks = (
        ("keyDelimiter", key_delimiter),
        ("valueDelimiter", value_delimiter),
    )
    # Keys already present in ``options`` take precedence over the arguments.
    for option_name, fallback in fallbacks:
        resource_options.setdefault(option_name, fallback)
    return self._set(dictionary=ExternalResource(path, read_as, resource_options))
def setExternalDataset(self, path, read_as=ReadAs.LINE_BY_LINE,
                       options={"format": "text"}):
    """Set the ``externalDataset`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options; a copy is passed on.

    Returns:
        The result of ``self._set``.
    """
    options_snapshot = options.copy()
    dataset = ExternalResource(path, read_as, options_snapshot)
    return self._set(externalDataset=dataset)
def setExternalDataset(self, path=None, read_as="LINE_BY_LINE",
                       options={"format": "text", "delimiter": ":"}):
    """Set the ``externalDataset`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. A fresh copy is made on every call.

    Returns:
        The result of ``self._set``.
    """
    # Copy here rather than in the signature: a default expression is
    # evaluated once at definition time, so ``{...}.copy()`` there still
    # yields a single dict shared by every call.
    resource = ExternalResource(path, read_as, options.copy())
    return self._set(externalDataset=resource)
def setSlangDictionary(self, path=None, read_as="LINE_BY_LINE",
                       options={"format": "text", "tokenPattern": r"\S+"}):
    """Set the ``slangDictionary`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. A fresh copy is made on every call.

    Returns:
        The result of ``self._set``.
    """
    # The default token pattern is a raw string because "\S" is not a valid
    # escape sequence in a regular literal.
    # Copy here rather than in the signature: a default expression is
    # evaluated once at definition time, so ``{...}.copy()`` there still
    # yields a single dict shared by every call.
    resource = ExternalResource(path, read_as, options.copy())
    return self._set(slangDictionary=resource)
def setCorpus(self, path=None, read_as="LINE_BY_LINE",
              options={"format": "text", "tokenPattern": r"\S+"}):
    """Set the ``corpus`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. A fresh copy is made on every call.

    Returns:
        The result of ``self._set``.
    """
    # The default token pattern is a raw string because "\S" is not a valid
    # escape sequence in a regular literal.
    # Copy here rather than in the signature: a default expression is
    # evaluated once at definition time, so ``{...}.copy()`` there still
    # yields a single dict shared by every call.
    resource = ExternalResource(path, read_as, options.copy())
    return self._set(corpus=resource)
def setDictionary(self, path=None, read_as="LINE_BY_LINE",
                  options={'format': 'text', 'delimiter': ','}):
    """Set the ``dictionary`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. A fresh copy is made on every call.

    Returns:
        The result of ``self._set``.
    """
    # Copy here rather than in the signature: a default expression is
    # evaluated once at definition time, so ``{...}.copy()`` there still
    # yields a single dict shared by every call.
    resource = ExternalResource(path, read_as, options.copy())
    return self._set(dictionary=resource)
def setEntities(self, path=None, read_as="LINE_BY_LINE", options={"format": "text"}):
    """Set the ``entities`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. A fresh copy is made on every call.

    Returns:
        The result of ``self._set``.
    """
    # Copy here rather than in the signature: a default expression is
    # evaluated once at definition time, so ``{...}.copy()`` there still
    # yields a single dict shared by every call.
    resource = ExternalResource(path, read_as, options.copy())
    return self._set(entities=resource)
def setDictionary(self, path=None, read_as="LINE_BY_LINE",
                  options={"format": "text", "keyDelimiter": "->", "valueDelimiter": "\t"}):
    """Set the ``dictionary`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options. A fresh copy is made on every call. The
            default uses ``"->"`` as key delimiter and a tab as value
            delimiter.

    Returns:
        The result of ``self._set``.
    """
    # Copy here rather than in the signature: a default expression is
    # evaluated once at definition time, so ``{...}.copy()`` there still
    # yields a single dict shared by every call.
    resource = ExternalResource(path, read_as, options.copy())
    return self._set(dictionary=resource)
def setCorpus(self, path, delimiter, read_as=ReadAs.LINE_BY_LINE,
              options={"format": "text"}):
    """Set the ``corpus`` param to an ``ExternalResource``.

    Args:
        path: Resource location, forwarded to ``ExternalResource``.
        delimiter: Always written to the ``"delimiter"`` option, replacing
            any value present in ``options``.
        read_as: Read mode forwarded to ``ExternalResource``.
        options: Resource options; copied before the delimiter is applied.

    Returns:
        The result of ``self._set``.
    """
    merged_options = options.copy()
    # Unconditional: the argument overrides a "delimiter" key in ``options``.
    merged_options.update(delimiter=delimiter)
    corpus_resource = ExternalResource(path, read_as, merged_options)
    return self._set(corpus=corpus_resource)