Example 1
    def __init__(self, rep_type: str = 'synonym', use_ner: bool = True):
        """Instantiate a ReplaceTerms object

        Parameters
        ----------
        rep_type : Optional(str)
            The type of target perturbation. May include `synonym` for word2vec replacement, `mlmsynonym` for MLM-based replacement, or `misspelling` for misspelling replacement.
        use_ner : Optional(bool)
            Flag specifying whether to use entity-aware replacement. If True, when calculating the sampling weights for any perturbation, named entities will be zeroed. In this case, the NER model is loaded here. The default value is True.
        """
        self.use_ner = use_ner if SPARK_NLP_ENABLED else False
        self.rep_type = rep_type
        if rep_type not in ['synonym', 'misspelling', 'mlmsynonym']:
            logger.error('{}:ReplaceTerms __init__ invalid rep_type'.format(
                __file__.split('/')[-1]))
            raise ValueError('Not an accepted generator type')
        self._generator = self._get_generator(rep_type)
        if not self._generator:
            raise RuntimeError('Unable to init generator')
        if self.use_ner:
            try:
                spark = sparknlp.start()
                self._ner_pipeline = PretrainedPipeline(
                    'recognize_entities_dl', lang='en')
            except Exception as e:
                logger.error(
                    '{}:ReplaceTerms __init__ unable to load NER pipeline'.format(
                        __file__.split('/')[-1]))
                raise RuntimeError('Unable to load ner pkg')
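
# Hedged usage sketch for the constructor above, assuming the class is importable from a
# module named term_replacement (as in its own doctest examples) and that spark-nlp is
# installed whenever use_ner=True.
from term_replacement import ReplaceTerms

# Misspelling-based perturbations without entity-aware masking (no NER pipeline is loaded)
perturber = ReplaceTerms(rep_type='misspelling', use_ner=False)

# An unsupported rep_type fails fast with ValueError('Not an accepted generator type')
try:
    ReplaceTerms(rep_type='antonym')
except ValueError as err:
    print(err)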
Example 2
def test_API():
    from sparknlp.pretrained import PretrainedPipeline

    ner_pipeline = PretrainedPipeline("ner_model_finder", "en",
                                      "clinical/models")

    result = ner_pipeline.annotate("medication")
    print(result)
    return result
Example 3
def main():
    spark, sc = init_spark()

    # Download a pre-trained pipeline
    pipeline = PretrainedPipeline('explain_document_dl', lang='en')

    # Your testing dataset
    text = """
    The Mona Lisa is a 16th century oil painting created by Leonardo.
    It's held at the Louvre in Paris.
    """

    # Annotate your testing dataset
    result = pipeline.annotate(text)

    # What's in the pipeline
    print(list(result.keys()))
    print(result['entities'])
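
    # Hedged follow-up sketch, assuming the result dict above also exposes 'token' and
    # 'ner' keys (as the pretrained NER pipelines do): pair each token with its tag.
    for token, tag in zip(result['token'], result['ner']):
        print(token, tag)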
Example 4
def construct_component_from_pipe_identifier(language, sparknlp_reference):
    '''
    # creates a list of components from a Spark NLP Pipeline reference
    # 1. download pipeline
    # 2. unpack pipeline to annotators and create list of nlu components
    # 3. return list of nlu components
    :param language: language of the pipeline
    :param sparknlp_reference: Reference to a Spark NLP pretrained pipeline
    :return: Each element of the Spark NLP pipeline wrapped as an NLU component inside of a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in sparknlp_reference : language='xx' #special edge case for lang detectors
    pipe = PretrainedPipeline(sparknlp_reference, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        
        if 'NerConverter' in  component.name : constructed_components.append(Util(component_name='ner_converter', model=component)) 
        elif parsed == 'match': constructed_components.append(nlu.Matcher(model=component)) 
        elif parsed == 'document': constructed_components.append(nlu.Util(model=component)) 
        elif parsed == 'sentence': constructed_components.append(nlu.Util(component_name='sentence_detector',model=component)) # todo differentiate normal and deep detector
        elif parsed == 'regex': constructed_components.append(nlu.Matcher(component_name='regex', model=component))
        elif parsed == 'text': constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'spell': constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer': constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer': constructed_components.append(nlu.lemmatizer.Normalizer(model=component))
        elif parsed == 'stemmer': constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif parsed == 'pos' or parsed =='language': constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'word': constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or  parsed == 'nerdlmodel': constructed_components.append(nlu.Classifier(component_name='ner',model=component))
        elif parsed == 'dependency': constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed': constructed_components.append(nlu.Util(model=component)) # todo util abuse
        elif parsed == 'multi': constructed_components.append(nlu.Util(model=component)) # todo util abuse 
        elif parsed == 'sentimentdlmodel': constructed_components.append(nlu.Classifier(model=component))
        elif parsed in ['universal','bert','albert', 'elmo', 'xlnet', 'glove','electra','covidbert','small_bert','']  : constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'vivekn': constructed_components.append(nlu.Classifier(component_name='vivekn', model=component))
        elif parsed == 'chunker': constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram': constructed_components.append(nlu.chunker.Chunker(model=component))
        elif '2e2' in parsed: constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'embeddings_chunk': constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords': constructed_components.append(nlu.StopWordsCleaner(model=component))
        
        logger.info("Extracted into NLU Component type : %s", parsed)
        if None in constructed_components :
            logger.exception("EXCEPTION: Could not infer component type for lang=%s and sparknlp_reference=%s during pipeline conversion,", language,sparknlp_reference)
            return None
    return constructed_components
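
# Hedged call sketch for the converter above, assuming the surrounding nlu module and its
# logger are initialized; 'explain_document_dl' is only an example pipeline reference.
components = construct_component_from_pipe_identifier('en', 'explain_document_dl')
if components is not None:
    for c in components:
        print(type(c).__name__)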
Example 5
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

# Download a pre-trained pipeline
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

# Your testing dataset
text = """
The Mona Lisa is a 16th century oil painting created by Leonardo. 
It’s held at the Louvre in Paris.
"""

# Annotate your testing dataset
result = pipeline.annotate(text)
# What’s in the pipeline
print(list(result.keys()))
print(result['entities'])
Example 6
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M"
}

print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

spark = sparknlp_jsl.start(SECRET, params=params)

from sparknlp.pretrained import PretrainedPipeline

ner_pipeline = PretrainedPipeline("ner_model_finder", "en", "clinical/models")

result = ner_pipeline.annotate("medication")

print(100 * '-')
print(result)
print(100 * '-')
Example 7
## LEMMATIZATION
import sparknlp
sparknlp.start()

from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import lower, col
from pyspark.ml.feature import StringIndexer

pipeline = PretrainedPipeline('explain_document_ml', 'en')
s_df2 = pipeline.annotate(s_df, "review_text")
s_df2 = s_df2.drop(
    *["document", "sentence", "token", "spell", "stems", "pos", "text"])


def mkString(line):
    return " ".join([str(x[3]) for x in line])


string_udf = udf(lambda z: mkString(z), StringType())
s_df2 = s_df2.withColumn("lemmatizedText", string_udf("lemmas"))
s_df2 = s_df2.withColumn("lemmatizedText", lower(col("lemmatizedText")))

# define the 4 processing steps and execute them with a transformation pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml import Pipeline
## LEMMATIZATION

# 1. Tokenizer: .setPattern("\\p{L}+") means that it removes accents from words (check it has no impact on the smileys!)
tokenizer = RegexTokenizer().setGaps(False)\
Example 8
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref):
    '''
    # creates a list of components from a Spark NLP Pipeline reference
    # 1. download pipeline
    # 2. unpack pipeline to annotators and create list of nlu components
    # 3. return list of nlu components
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :return: Each element of the Spark NLP pipeline wrapped as an NLU component inside of a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline
    if 'language' in nlp_ref: language = 'xx'  # special edge case for lang detectors
    pipe = PretrainedPipeline(nlp_ref, lang=language)
    constructed_components = []
    for component in pipe.light_model.pipeline_model.stages:
        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        c_name = component.__class__.__name__
        if c_name == 'NerConverter':
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif c_name == 'TokenizerModel' and parsed !='regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif c_name == 'TokenizerModel':
            constructed_components.append(nlu.Tokenizer(model=component,annotator_class='regex_tokenizer'))
        elif parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif parsed == 'document':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'sentence':
            constructed_components.append(nlu.Util(annotator_class='sentence_detector', model=component))
        elif parsed == 'regex':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'date':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'text':
            constructed_components.append(nlu.Matcher(model=component, nlu_ref=parsed))
        elif parsed == 'spell':
            constructed_components.append(nlu.SpellChecker(model=component))
        elif parsed == 'lemmatizer':
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif parsed == 'normalizer':
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif parsed == 'stemmer':
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif c_name == 'PerceptronModel':
            constructed_components.append(nlu.Classifier(annotator_class='classifierdl', model=component))
        elif c_name == 'ClassifierDLModel':
            constructed_components.append(nlu.Classifier(annotator_class='language_detector', model=component))

        elif parsed == 'word':
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed == 'ner' or parsed == 'nerdlmodel':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'dependency':
            constructed_components.append(nlu.Util(model=component))
        elif parsed == 'typed':
            constructed_components.append(nlu.UnlabledDepParser(model=component))
        elif parsed == 'multi':
            constructed_components.append(nlu.Classifier(model=component))
        elif parsed == 'sentimentdlmodel':
            constructed_components.append(nlu.Classifier(model=component))

        elif parsed == 'chunker':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'ngram':
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif parsed == 'embeddings_chunk':
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif parsed == 'stopwords':
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        else:
            logger.exception(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
                language, nlp_ref)
            logger.info("USING DEFAULT ANNOTATOR TYPE Normalizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))

        logger.info("Extracted into NLU Component type : %s", parsed)
        if None in constructed_components:
            logger.exception(
                "EXCEPTION: Could not infer component type for lang=%s and nlp_ref=%s during pipeline conversion,",
                language, nlp_ref)
            return None
    return constructed_components
Example 9
df.cache().count()

# COMMAND ----------

display(df.select("text"))

# COMMAND ----------

# MAGIC %md #### Extraction

# COMMAND ----------

import sparknlp 
from sparknlp.pretrained import PretrainedPipeline 

pipeline = PretrainedPipeline('recognize_entities_dl_noncontrib', 'en')
result = pipeline.annotate(df, column = 'text') 

# COMMAND ----------

result.cache().count()

# COMMAND ----------

from pyspark.sql.functions import explode, col
from wordcloud import WordCloud,STOPWORDS

import matplotlib.pyplot as plt

exploded = result.select(explode(col('entities.result')).alias("entities"))
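
# Hedged continuation: the snippet imports WordCloud/matplotlib but stops after the explode,
# so this sketch collects the entity strings and renders the word cloud it set up.
entities_text = ' '.join(row['entities'] for row in exploded.collect())
wordcloud = WordCloud(stopwords=STOPWORDS, width=800, height=400).generate(entities_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()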
def load_pipeline(name):
    #if name=='match_datetime':
    #return light_Datetime
    #else:
    return PretrainedPipeline(name, lang='en')
Example 11
def construct_component_from_pipe_identifier(
        language,
        nlp_ref,
        nlu_ref,
        path=None,
        is_licensed=False):  # -> NLUPipeline
    """
    creates a list of components from a Spark NLP Pipeline reference
    1. download pipeline
    2. unpack pipeline to annotators and create list of nlu components
    3. return list of nlu components
    :param is_licensed: Whether the pipe is licensed or not
    :param nlu_ref: Nlu ref that points to this pipe
    :param language: language of the pipeline
    :param nlp_ref: Reference to a spark nlp pretrained pipeline
    :param path: Load component_list from HDD
    :return: Each element of the Spark NLP pipeline wrapped as a NLU component inside a list
    """
    if 'language' in nlp_ref:
        # special edge case for lang detectors
        language = 'xx'
    if path is None:
        if is_licensed:
            pipe = PretrainedPipeline(nlp_ref,
                                      lang=language,
                                      remote_loc='clinical/models')
        else:
            pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []
    os_annos = AnnoClassRef.get_os_pyclass_2_anno_id_dict()
    hc_annos = AnnoClassRef.get_hc_pyclass_2_anno_id_dict()
    ocr_annos = AnnoClassRef.get_ocr_pyclass_2_anno_id_dict()
    for jsl_anno_object in iterable_stages:
        anno_class_name = type(jsl_anno_object).__name__
        logger.info(
            f"Extracting model from Spark NLP pipeline: obj= {jsl_anno_object} class_name = {anno_class_name} and creating Component"
        )
        if anno_class_name in os_annos.keys():
            jsl_anno_id = os_annos[anno_class_name]
            nlu_component = ComponentMap.os_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref,
                                       language, True, Licenses.open_source)
            constructed_components.append(nlu_component)
        elif anno_class_name in hc_annos.keys():
            # Licensed HC
            jsl_anno_id = hc_annos[anno_class_name]
            nlu_component = ComponentMap.hc_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref,
                                       language, True, Licenses.hc)
            constructed_components.append(nlu_component)
        elif anno_class_name in ocr_annos:
            # Licensed OCR (WIP)
            jsl_anno_id = ocr_annos[anno_class_name]
            nlu_component = ComponentMap.ocr_components[jsl_anno_id]
            nlu_component.set_metadata(jsl_anno_object, nlu_ref, nlp_ref,
                                       language, True, Licenses.ocr)
            constructed_components.append(nlu_component)
        else:
            raise ValueError(
                f'Could not find matching nlu component for annotator class = {anno_class_name}'
            )
        if None in constructed_components or len(constructed_components) == 0:
            raise Exception(
                f"Failure inferring type anno_class={anno_class_name} ")
    return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
        PipeUtils.set_column_values_on_components_from_pretrained_pipe(
            constructed_components, nlp_ref, language, path))
Example 12
class ReplaceTerms():
    """ A class to generate sentence perturbations by replacement
    ...
    Methods
    ----------
    replace_terms(sentence, importance_scores, num_replacements, num_output_sents, sampling_strategy, sampling_k)
      Generate sentence perturbations by replacing terms in an input sentence
    """
    global SPARK_NLP_ENABLED

    def __init__(self, rep_type: str = 'synonym', use_ner: bool = True):
        """Instantiate a ReplaceTerms object

        Parameters
        ----------
        rep_type : Optional(str)
            The type of target perturbation. May include `synonym` for word2vec replacement, `mlmsynonym` for MLM-based replacement, or `misspelling` for misspelling replacement.
        use_ner : Optional(bool)
            Flag specifying whether to use entity-aware replacement. If True, when calculating the sampling weights for any perturbation, named entities will be zeroed. In this case, the NER model is loaded here. The default value is True.
        """
        self.use_ner = use_ner if SPARK_NLP_ENABLED else False
        self.rep_type = rep_type
        if rep_type not in ['synonym', 'misspelling', 'mlmsynonym']:
            logger.error('{}:ReplaceTerms __init__ invalid rep_type'.format(
                __file__.split('/')[-1]))
            raise ValueError('Not an accepted generator type')
        self._generator = self._get_generator(rep_type)
        if not self._generator:
            raise RuntimeError('Unable to init generator')
        if self.use_ner:
            try:
                spark = sparknlp.start()
                self._ner_pipeline = PretrainedPipeline(
                    'recognize_entities_dl', lang='en')
            except Exception as e:
                logger.error(
                    '{}:ReplaceTerms __init__ unable to load NER pipeline'.format(
                        __file__.split('/')[-1]))
                raise RuntimeError('Unable to load ner pkg')

    def _get_entities(self, sentence: str) -> Dict:
        """ Tokenize and annotate sentence """

        if self.use_ner:
            # Use spark-nlp tokenizer for entity-aware mask
            allowed_tags = ['PER', 'LOC', 'ORG', 'MISC']

            # Annotate your testing dataset
            result = self._ner_pipeline.annotate(sentence)
            toks = result['token']
            mask = [
                1 if
                (any([y in x
                      for y in allowed_tags]) or not toks[i].isalnum()) else 0
                for i, x in enumerate(result['ner'])
            ]
        else:
            # Use simple NLTK tokenizer
            toks = nltk.word_tokenize(sentence)
            mask = [0] * len(toks)
        return mask, toks

    def _get_generator(self, name: str = None):
        if name == 'synonym':
            try:
                _syn = SynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        elif name == 'misspelling':
            try:
                _missp = MisspReplace()
                return _missp
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load misspellings'.format(
                        __file__.split('/')[-1]))
        elif name == 'mlmsynonym':
            try:
                _syn = MLMSynonymReplace()
                return _syn
            except Exception as e:
                logger.error(
                    '{}:replace_terms: unable to load word vectors'.format(
                        __file__.split('/')[-1]))
        return

    def replace_terms(self,
                      sentence: str,
                      importance_scores: List = None,
                      num_replacements: int = 1,
                      num_output_sents: int = 1,
                      sampling_strategy: str = 'random',
                      sampling_k: int = None) -> List:
        """Generate a certain number of sentence perturbations by replacement using either misspelling or synonyms

        Parameters
        ----------
        sentence : str
            The input sentence to be perturbed.
        importance_scores : Optional(List)
            List of tuples defining a weight for each term in the tokenized sentence. These weights are used during sampling to influence perturbation probabilities. If None, uniform sampling is used by default.
        num_replacements : Optional(int)
            Target number of terms to replace in the original sentence. The number is chosen randomly, using the target as an upper bound and 1 as the lower bound. The default is 1.
        num_output_sents : Optional(int)
            Target number of perturbed sentences to generate based on the original sentence. The default is 1.
        sampling_strategy : Optional(str)
            Strategy used to sample terms to perturb in the original sentence. The default is random. If importance_scores is given, then sampling_strategy may be `topK` or `bottomK`, in which case the importance_scores (or inverted scores) vector is used for weighted sampling.
        sampling_k : Optional(int)
            The number of terms in the importance score vector to include in topK or bottomK sampling. This parameter is not used by the default sampling_strategy, `random` sampling.
        Returns
        -------
        [str]
            Returns a list of perturbed sentences for the input sentence.

        Example
        -------
        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="synonym")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.generate(sent, num_terms, num_output_sents)
        ['I born in a small village']

        >>> from term_replacement import ReplaceTerms
        >>> p = ReplaceTerms(rep_type="misspelling")
        >>> sent = "I was born in a small town"
        >>> num_terms = 1
        >>> num_output_sents = 1
        >>> p.generate(sent, num_terms, num_output_sents)
        ['I born in a smal town']
        """

        inputs = validate_inputs(num_replacements, num_output_sents,
                                 sampling_strategy)
        num_replacements = inputs.pop(0)
        num_output_sents = inputs.pop(0)
        sampling_strategy = inputs.pop(0)

        # Extract entities in the input sentence to mask
        masked_vector, tokens = self._get_entities(sentence)

        # Check if there are enough candidate terms
        if num_replacements > (len(masked_vector) - sum(masked_vector)):
            logger.warning(
                '{}:replace_terms: unable to generate num_replacements - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(masked_vector) - sum(masked_vector)))
            num_replacements = len(masked_vector) - sum(masked_vector)

        if self.rep_type == 'misspelling':
            remove_stop = True
        else:
            remove_stop = False

        # Initialize sampling scores
        importance_scores = get_scores(tokens, sampling_strategy, sampling_k,
                                       importance_scores, remove_stop)

        if not importance_scores:
            return []

        # Add index and mask to importance scores
        term_score_index = [(word[0], i, masked_vector[i])
                            for i, word in enumerate(importance_scores)]

        # Store only scores for later sampling
        importance_scores = [
            x[1] if not masked_vector[i] else 0  # set masked scores to zero
            for i, x in enumerate(importance_scores)
        ]

        # Candidate terms for synonym replacement
        rep_term_indices = [w[1] for w in term_score_index if not w[2]]

        # Create List of Lists of term variants
        generated = {x[0]: None for x in term_score_index}
        generated = {
            x[0]: self._generator.generate(x[0].lower(), 10, **{
                'toks': tokens,
                'token_idx': i
            })
            for i, x in enumerate(term_score_index) if not x[2]
        }

        term_variants = {
            x[0]: generated.get(x[0], []) if
            (i in rep_term_indices and not masked_vector[i]) else []
            for i, x in enumerate(term_score_index)
        }

        # Check if there are enough candidate terms
        if not term_variants:
            logger.warning(
                '{}:replace_terms: unable to generate num_variants - {} of ({})'
                .format(
                    __file__.split('/')[-1], num_replacements,
                    len(term_variants) - sum(masked_vector)))
        else:
            term_variants = {
                k: [x[0].upper() + x[1:] for x in v] if k[0].isupper() else v
                for k, v in term_variants.items()
            }

        # Set scores to zero for all terms w/o synonyms
        importance_scores = [
            x if (term_score_index[i][0] in term_variants
                  and len(term_variants[term_score_index[i][0]]) > 0) else 0
            for i, x in enumerate(importance_scores)
        ]

        # Renormalize
        if sum(importance_scores) == 0:
            return []  # avoid division by 0 error

        importance_scores = [
            x / sum(importance_scores) for x in importance_scores
        ]

        # Resize num_replacements to avoid p-sampling errors
        nonzero_entries = sum([x > 0. for x in importance_scores])
        if num_replacements > nonzero_entries:
            num_replacements = nonzero_entries
        '''
        # DEBUG
        # Create a List of Lists of all variants
        candidate_variants = [
            v+[k]
            for k,v in term_variants.items()
        ]

        # Check the total number of variants
        candidate_sents = list(
            itertools.product(*candidate_variants)
        )

        # Set number of output variants to the total possible
        if len(candidate_sents) < num_output_sents:
            num_output_sents = len(candidate_sents)
        '''
        if not term_variants or len([x[2] == 0
                                     for x in term_score_index]) == 0:
            raise Exception('no term variants or term_score_index')

        max_attempts = 50
        counter = 0
        new_sentences = set()
        while len(new_sentences) < num_output_sents:
            if counter > max_attempts:
                break

            # Select terms to replace
            rnd_indices = np.random.choice(len(term_score_index),
                                           size=num_replacements,
                                           replace=False,
                                           p=importance_scores)
            replace_terms = [term_score_index[i][0] for i in rnd_indices]

            # Create List of Lists of term variants
            term_combinations = [
                term_variants.get(x[0], [x[0]])
                if x[0] in replace_terms else [x[0]]
                for i, x in enumerate(term_score_index)
            ]

            # Generate combinatorial variants
            candidate_sents = list(itertools.product(*term_combinations))

            for sent in candidate_sents:
                new_sentences.add(' '.join(sent))
            counter += 1

        # Shuffle permutations, sanitize and slice
        new_sentences = list(new_sentences)
        random.shuffle(new_sentences)
        new_sentences = [
            re.sub(r'([A-Za-z0-9])(\s+)([^A-Za-z0-9])', r'\1\3',
                   x.replace('\' s ', '\'s '))
            for x in new_sentences[:num_output_sents]
        ]
        new_sentences = [x for x in new_sentences if x != sentence]

        if len(new_sentences) < num_output_sents:
            logger.debug(
                '{}:replace_terms: unable to generate num_output_sents - {} of ({})'
                .format(
                    __file__.split('/')[-1], len(new_sentences),
                    num_output_sents))
        return new_sentences
def cleantext(text):
    text = re.sub('#', '', text)  # Removing '#' hash tag
    text = re.sub('RT[\s]+', '', text)  # Removing RT
    text = re.sub('https?:\/\/\S+', '', text)  # Removing hyperlink
    text = re.sub(':', '', text)  # remove colon
    text = re.sub('(\.|\!|\?|\,)', '', text)  # remove punctuation
    return text


udf_fun = udf(lambda text: cleantext(text), StringType())
preprocessed_text = twitter_df.select('id',
                                      udf_fun('text').alias('text'), 'user')

preprocessed_text.show()

#use pipeline
pipeline = PretrainedPipeline("analyze_sentiment")
result = pipeline.annotate(preprocessed_text, column='text')
#result.select("sentiment.result").show()

#write result to mongodb

cols = ['id', 'text', 'sentiment.result', 'user']
output = result.select(cols)
#output.show()


output.write\
    .format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .option("collection", "sentiment_predicted") \
    .save()
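
# Hedged read-back sketch: verify the write by loading the same collection through the
# connector format used above, assuming the active SparkSession `spark` and the
# connector's input URI are already configured.
written = spark.read \
    .format("com.mongodb.spark.sql.DefaultSource") \
    .option("collection", "sentiment_predicted") \
    .load()
written.show(5)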
Example 14
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StringType, StructType, ArrayType
from pyspark.sql.functions import from_json
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
import pandas as pd
import matplotlib.pyplot as plt
import re

bp = PretrainedPipeline.from_disk('Explain_document_dl_en')
spark.version

dir(sparknlp.base)

#!pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1
#!pip install spark-nlp

#CREATE SPARK SESSION

spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

# Load from Kinesis Stream
rawData = spark\
    .readStream\
    .format("kinesis")\
    .option("streamName", "tech-trends-stream")\
    .option("endpointUrl", "https://kinesis.eu-west-2.amazonaws.com")\
    .load()

tweetSchema = StructType() \
    .add("text", StringType()) \
    .add("hashtags", ArrayType(StringType()))

# Extract JSON data from Kinesis message
tweets = rawData \
    .selectExpr("cast (data as STRING) jsonData") \
    .select(from_json("jsonData", tweetSchema).alias("tweets")) \
    .select('tweets.text')

# Load Pipeline and Transform for Sentiment
pipeline = PretrainedPipeline("analyze_sentiment", lang="en")
sentiments = pipeline.transform(tweets)

result = sentiments.select('text', 'sentiment')

# Write to JSON in S3
query = sentiments.writeStream\
    .format("json")\
    .option("path", "s3a://tech-trends-output/sentiments")\
    .option("checkpointLocation", "s3a://tech-trends-output/sentiments/checkpoint")\
    .start()
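
# Hedged follow-up: keep the driver alive until the streaming query is stopped, otherwise
# the job may exit before any micro-batch is written to S3.
query.awaitTermination()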
Example 16
def _clean_sent_pipeline(data_ip, input_col, import_c=True):
  print(f"\t\t\t---- Starting the pipeline built for >>> {input_col} <<< with import condition {import_c} ----")
  from pyspark.sql import functions as F
  data=data_ip
  from pyspark.sql.types import IntegerType
  data= data.withColumn("_c0", data["_c0"].cast(IntegerType()))
  text_col = input_col
  non_null_index = (data.filter(data[text_col].isNotNull())).select('_c0')

  text_clean = data.select(text_col).filter(F.col(text_col).isNotNull())
  print(f"\n\t1. Cleaning the input for Null {data.count()} to {data.count()-non_null_index.count()}")

  if import_c: from sparknlp.base import DocumentAssembler
  documentAssembler = sparknlp.base.DocumentAssembler().setInputCol(text_col).setOutputCol('document')
  print(f"\n\t2. Attaching DocumentAssembler Transformer to the pipeline")

  if import_c: from sparknlp.annotator import Tokenizer
  tokenizer = sparknlp.annotator.Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
  print(f"\n\t3. Attaching Tokenizer Annotator to the pipeline")

  if import_c: from sparknlp.annotator import Normalizer
  normalizer = sparknlp.annotator.Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
  print(f"\n\t4. Attaching Normalizer Annotator to the pipeline")

  if import_c: from sparknlp.annotator import LemmatizerModel
  lemmatizer = sparknlp.annotator.LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
  print(f"\n\t5. Attaching LemmatizerModel Annotator to the pipeline")

  if import_c: 
    import nltk
    nltk.download("popular")
  from nltk.corpus import stopwords
  eng_stopwords = stopwords.words('english')
  print(f"\n\t6. nltk stop-words found")

  if import_c: from sparknlp.annotator import StopWordsCleaner
  stopwords_cleaner = sparknlp.annotator.StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
  print(f"\n\t7. Attaching StopWordsCleaner Annotator to the pipeline")

  if import_c: from sparknlp.annotator import NGramGenerator
  ngrammer = sparknlp.annotator.NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
  print(f"\n\t8. Attaching NGramGenerator Annotator to the pipeline")
  

  if import_c: from sparknlp.annotator import PerceptronModel
  pos_tagger = sparknlp.annotator.PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
  print(f"\n\t9. Attaching PerceptronModel Annotator to the pipeline")

  if import_c: from sparknlp.base import Finisher
  finisher = sparknlp.base.Finisher().setInputCols(['unigrams', 'ngrams','pos'])
  print(f"\n\t10. Attaching Finisher Transformer to the pipeline")

  from pyspark.ml import Pipeline
  pipeline = Pipeline().setStages([documentAssembler,
                                  tokenizer,
                                  normalizer,
                                  lemmatizer,
                                  stopwords_cleaner,
                                  pos_tagger,
                                  ngrammer,
                                  finisher])
  print("\n\t\t\t ---- Pipeline Built Successfully ----")

  processed_tweets = pipeline.fit(text_clean).transform(text_clean)
  print("\n\t\t\t ---- Pipeline Fitted Successfully ----")

  from pyspark.sql.functions import concat
  processed_tweets = processed_tweets.withColumn('final',concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
  print("\n\tData Concatenation done - unigrams + ngrams")

  print("\n\t\t\t ---- Loading the Pre-trained Pipeline  analyze_sentimentdl_use_twitter----")

  from sparknlp.pretrained import PretrainedPipeline
  pipeline_sent = PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")

  pipout_sent_results = pipeline_sent.transform(processed_tweets.withColumnRenamed(text_col, "text"))

  print("\n\t\t\t ---- Sentiments Fetched Successfully ----\n\n\n")

  from pyspark.sql.functions import col
  from pyspark.sql.functions import monotonically_increasing_id, row_number
  from pyspark.sql.window import Window
  pipout_sent_results=pipout_sent_results.withColumn("id_tmp",row_number().over(Window.orderBy(monotonically_increasing_id())))
  non_null_index=non_null_index.withColumn("id_tmp",row_number().over(Window.orderBy(monotonically_increasing_id())))

  print("\n$$$ Indexing done for the Compiled Result")

  data_op=data.join(non_null_index.join(pipout_sent_results, on=["id_tmp"]).drop("id_tmp"), on=["_c0"], how='left_outer')
  data_op=data_op.withColumn("_c0", data_op["_c0"].cast(IntegerType()))

  print("\n$$$ Joining the final results with the original dataframe")

  print(f"\nOriginal IP={data.count()} \nNonNull Index={non_null_index.count()} \nNull_Clean={text_clean.count()} \nOriginal OP={data_op.count()}")
  print(data.show(4))
  #print("\t\t\t\t\t CONVERTED TO THIS")
  final_results = data_op.orderBy("_c0")
  print("\n$$$ Spark DataFrame created")


  id = list((((final_results.select('str_id')).toPandas())).str_id)
  createdat = list((((final_results.select('created_at')).toPandas())).created_at)
  fulltext = list((((final_results.select('full_text')).toPandas())).full_text)
  favoritecount = list((((final_results.select('favorite_count')).toPandas())).favorite_count)
  retweetcount = list((((final_results.select('retweet_count')).toPandas())).retweet_count)
  pipeclean = list((((final_results.select('text')).toPandas())).text)
  textlen = list(((final_results.select('finished_unigrams')).toPandas()).finished_unigrams.apply(lambda row: int(len(row))))
  sentscores = list(((final_results.select('sentiment')).toPandas()).sentiment.apply(lambda row: (((str(row)).split(",")[3]).split("'")[1])))
  op_df = p.DataFrame(list(zip(id,createdat,fulltext,favoritecount,retweetcount,pipeclean,textlen,sentscores)), columns = ['str_id','created_at','text_full','favorite_count','retweet_count','text_pipe_clean','text_length','sentiment_score'])

  print("\n$$$ Pandas DataFrame created")
  print(op_df.head(4))

  return op_df 
def load_pipeline(name):
    return PretrainedPipeline(name, lang='en')
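
# Hedged usage of the load_pipeline helper above (the pipeline is downloaded on first use):
explain_pipe = load_pipeline('explain_document_ml')
annotations = explain_pipe.annotate("Spark NLP ships many pretrained pipelines.")
print(list(annotations.keys()))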
Example 18
def construct_component_from_pipe_identifier(language, nlp_ref, nlu_ref,path=None):
    '''
    # creates a list of components from a Spark NLP Pipeline reference
    # 1. download pipeline
    # 2. unpack pipeline to annotators and create list of nlu components
    # 3. return list of nlu components
    :param nlu_ref: NLU reference that points to this pipeline
    :param language: language of the pipeline
    :param nlp_ref: Reference to a Spark NLP pretrained pipeline
    :param path: Load pipe from HDD
    :return: Each element of the Spark NLP pipeline wrapped as an NLU component inside of a list
    '''
    logger.info("Starting Spark NLP to NLU pipeline conversion process")
    from sparknlp.pretrained import PretrainedPipeline, LightPipeline
    if 'language' in nlp_ref: language = 'xx'  # special edge case for lang detectors
    if path is None:
        pipe = PretrainedPipeline(nlp_ref, lang=language)
        iterable_stages = pipe.light_model.pipeline_model.stages
    else:
        pipe = LightPipeline(PipelineModel.load(path=path))
        iterable_stages = pipe.pipeline_model.stages
    constructed_components = []

    # for component in pipe.light_model.pipeline_model.stages:
    for component in iterable_stages:

        logger.info("Extracting model from Spark NLP pipeline: %s and creating Component", component)
        parsed = str(component).split('_')[0].lower()
        logger.info("Parsed Component for : %s", parsed)
        c_name = component.__class__.__name__
        if isinstance(component, NerConverter):
            constructed_components.append(Util(annotator_class='ner_converter', model=component))
        elif parsed in NameSpace.word_embeddings + NameSpace.sentence_embeddings:
            constructed_components.append(nlu.Embeddings(model=component))
        elif parsed in NameSpace.classifiers:
            constructed_components.append(nlu.Classifier(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, PerceptronModel):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, (ClassifierDl,ClassifierDLModel)):
            constructed_components.append(nlu.Classifier(nlp_ref='classifierdl', model=component))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='use'))
        elif isinstance(component, BertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert'))
        elif isinstance(component, AlbertEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='albert'))
        elif isinstance(component, XlnetEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='xlnet'))
        elif isinstance(component, WordEmbeddingsModel):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='glove'))
        elif isinstance(component, ElmoEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='elmo'))
        elif isinstance(component, BertSentenceEmbeddings):
            constructed_components.append(nlu.Embeddings(model=component, nlp_ref='bert_sentence'))
        elif isinstance(component, UniversalSentenceEncoder):
            constructed_components.append(nlu.Embeddings(model=component, nlu_ref='use'))
        elif isinstance(component, TokenizerModel) and parsed != 'regex':
            constructed_components.append(nlu.Tokenizer(model=component))
        elif isinstance(component, TokenizerModel) and parsed == 'regex' :
            constructed_components.append(nlu.Tokenizer(model=component, annotator_class='regex_tokenizer'))
        elif isinstance(component, DocumentAssembler):
            constructed_components.append(nlu.Util(model=component))
        elif isinstance(component, SentenceDetectorDLModel):
            constructed_components.append(NLUSentenceDetector(annotator_class='deep_sentence_detector', model=component))
        elif isinstance(component, (SentenceDetectorDLModel, SentenceDetector)):
            constructed_components.append(NLUSentenceDetector(annotator_class='pragmatic_sentence_detector', model=component))
        elif isinstance(component, RegexMatcherModel) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component, annotator_class='regex'))
        elif isinstance(component, TextMatcherModel):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='text'))
        elif isinstance(component, DateMatcher):
            constructed_components.append(nlu.Matcher(model=component, annotator_class='date'))
        elif isinstance(component, ContextSpellCheckerModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='context'))
        elif isinstance(component, SymmetricDeleteModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='symmetric'))
        elif isinstance(component, NorvigSweetingModel):
            constructed_components.append(nlu.SpellChecker(model=component, annotator_class='norvig'))
        elif isinstance(component, LemmatizerModel):
            constructed_components.append(nlu.lemmatizer.Lemmatizer(model=component))
        elif isinstance(component, NormalizerModel):
            constructed_components.append(nlu.normalizer.Normalizer(model=component))
        elif isinstance(component, Stemmer):
            constructed_components.append(nlu.stemmer.Stemmer(model=component))
        elif isinstance(component, (NerDLModel, NerCrfModel)):
            component.setIncludeConfidence(True)  # Pipelines don't always extract confidences, so enable confidence extraction manually for all pipelines
            constructed_components.append(nlu.Classifier(model=component, annotator_class='ner'))
        elif isinstance(component, LanguageDetectorDL):
            constructed_components.append(nlu.Classifier(model=component, annotator_class='language_detector'))

        elif isinstance(component, DependencyParserModel):
            constructed_components.append(UnlabledDepParser(model=component))
        elif isinstance(component, TypedDependencyParserModel):
            constructed_components.append(LabledDepParser(model=component))
        elif isinstance(component, MultiClassifierDLModel):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='multiclassifierdl'))
        elif isinstance(component, (SentimentDetectorModel,SentimentDLModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='sentimentdl'))
        elif isinstance(component, (SentimentDetectorModel,ViveknSentimentModel)):
            constructed_components.append(nlu.Classifier(model=component, nlp_ref='vivekn'))
        elif isinstance(component, Chunker):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, NGram):
            constructed_components.append(nlu.chunker.Chunker(model=component))
        elif isinstance(component, ChunkEmbeddings):
            constructed_components.append(embeddings_chunker.EmbeddingsChunker(model=component))
        elif isinstance(component, StopWordsCleaner):
            constructed_components.append(nlu.StopWordsCleaner(model=component))
        elif isinstance(component, (TextMatcherModel, RegexMatcherModel, DateMatcher,MultiDateMatcher)) or parsed == 'match':
            constructed_components.append(nlu.Matcher(model=component))
        elif isinstance(component,(T5Transformer)):
            constructed_components.append(nlu.Seq2Seq(annotator_class='t5', model=component))
        elif isinstance(component,(MarianTransformer)):
            constructed_components.append(nlu.Seq2Seq(annotator_class='marian', model=component))
        else:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} and model {component} during pipeline conversion,")
            logger.info("USING DEFAULT ANNOTATOR TYPE Normalizer to fix issue")
            constructed_components.append(nlu.normalizer.Normalizer(model=component))

        logger.info(f"Extracted into NLU Component type : {parsed}", )
        if None in constructed_components:
            logger.exception(
                f"EXCEPTION: Could not infer component type for lang={language} and nlp_ref={nlp_ref} during pipeline conversion,")
            return None
    return constructed_components
Example 19
def extract_sentiment(aws_conn_id: str, tweets_path: str, summary_path: str,
                      language: str, **kwargs):
    aws_hook = AwsHook(aws_conn_id=aws_conn_id)
    aws_credentials = aws_hook.get_credentials()

    spark = (
        SparkSession.builder.master("local[*]").appName(
            "Analyse sentiment of given tweets").config(
                "spark.serializer",
                "org.apache.spark.serializer.KryoSerializer").config(
                    "spark.kryoserializer.buffer.max", "1000M")
        # .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0,"
        #                                "org.apache.hadoop:hadoop-common:3.2.0,"
        #                                "org.apache.hadoop:hadoop-annotations:3.2.0,"
        #                                "org.apache.hadoop:hadoop-auth:3.2.0,"
        #                                "org.apache.hadoop:hadoop-client:3.2.0")
        .config("spark.jars.packages",
                "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5").config(
                    "spark.hadoop.fs.s3a.access.key",
                    aws_credentials.access_key).config(
                        "spark.hadoop.fs.s3a.secret.key",
                        aws_credentials.secret_key).config(
                            "spark.hadoop.fs.s3a.impl",
                            "org.apache.hadoop.fs.s3a.S3AFileSystem").config(
                                "spark.hadoop.fs.s3a.endpoint",
                                "s3-eu-central-1.amazonaws.com").
        config("spark.hadoop.fs.s3a.path.style.access", "true").config(
            "spark.executor.extraJavaOptions",
            "-Dcom.amazonaws.services.s3.enableV4=true").config(
                "spark.driver.extraJavaOptions",
                "-Dcom.amazonaws.services.s3.enableV4=true").getOrCreate())

    year = kwargs['execution_date'].year
    month = kwargs['execution_date'].month
    day = kwargs['execution_date'].day
    tweets_path = f'{tweets_path}/{year:04d}/{month:02d}/{day:02d}/*.jsonl.gz'
    summary_path = f'{summary_path}/{year:04d}-{month:02d}-{day:02d}.jsonl'

    logging.info(f'Reading tweets from: {tweets_path}')
    tweets = spark.read.json(tweets_path)

    english_tweets_only = tweets.select('full_text').where(
        tweets.lang == language)
    original_english_tweets_only = english_tweets_only.where(
        ~english_tweets_only.full_text.startswith('RT @'))

    sentiment_pipeline = PretrainedPipeline('analyze_sentiment', language)
    analysed_tweets = sentiment_pipeline.annotate(original_english_tweets_only,
                                                  column='full_text')

    main_sentiment = udf(lambda col: Counter(col).most_common(1)[0][0],
                         StringType())

    tweets_with_overall_sentiment = (analysed_tweets.withColumn(
        'overall_sentiment',
        main_sentiment(analysed_tweets.sentiment.result)).drop(
            'document', 'sentence', 'token', 'checked'))

    tweets_sentiment_summary = tweets_with_overall_sentiment.groupBy(
        'overall_sentiment').count()

    tweets_sentiment_record = dict(
        tweets_sentiment_summary.rdd.map(
            lambda r: (r['overall_sentiment'], r['count'])).collect())
    tweets_sentiment_record[
        'tweets_sentiment_id'] = f'{year:04d}-{month:02d}-{day:02d}({language})'
    tweets_sentiment_record['year'] = year
    tweets_sentiment_record['month'] = month
    tweets_sentiment_record['day'] = day
    tweets_sentiment_record['language'] = language
    tweets_sentiment_record['positive_count'] = tweets_sentiment_record[
        'positive']
    tweets_sentiment_record['negative_count'] = tweets_sentiment_record[
        'negative']
    tweets_sentiment_record['na_count'] = tweets_sentiment_record['na']
    del tweets_sentiment_record['positive']
    del tweets_sentiment_record['negative']
    del tweets_sentiment_record['na']

    logging.info(
        f'Extracted sentiment summary for {year:04d}-{month:02d}-{day:02d}: {tweets_sentiment_record}'
    )

    tweets_sentiment = spark.createDataFrame([tweets_sentiment_record])
    tweets_sentiment.write.json(summary_path, mode='overwrite')
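
# Hedged wiring sketch, assuming this function backs an Airflow 1.x PythonOperator
# (it reads execution_date from **kwargs, so the task context must be provided).
# The DAG name, connection id and S3 paths below are placeholders, not from the source.
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('twitter_sentiment', start_date=datetime(2020, 1, 1), schedule_interval='@daily')

extract_sentiment_task = PythonOperator(
    task_id='extract_sentiment',
    python_callable=extract_sentiment,
    op_kwargs={
        'aws_conn_id': 'aws_default',
        'tweets_path': 's3a://example-bucket/tweets',
        'summary_path': 's3a://example-bucket/sentiment',
        'language': 'en',
    },
    provide_context=True,
    dag=dag,
)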
import sparknlp
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType, StringType
from sparknlp.pretrained import PretrainedPipeline

# 1. Setup
sparknlp.start()
conf = SparkConf().setAppName('parallel-project')
sc = SparkContext.getOrCreate()
spark = SQLContext(sc)

pipeline = PretrainedPipeline('analyze_sentiment', 'en')

# 2. Data Cleansing
# read in data to a DataFrame
comments = spark.read.json('RC_2019-02-28-one-day')
# dummy_data = [["Hello, world!", "/r/soccer"], ["Wow. Simply wow. What an unbelievable pass, inch perfect.", "/r/nba"]]
# comments = sc.parallelize(dummy_data).toDF(['body', 'subreddit'])
comments.printSchema()

# Rename 'body' to 'text' for spark-nlp
comments = comments.withColumnRenamed('body', 'text')

# keep only the columns we're interested in
commentsCleaned = comments.select('subreddit', 'text')

# Filter out bad comment data
commentsCleaned = commentsCleaned.filter(commentsCleaned.text != '[deleted]')\
                                 .filter(commentsCleaned.text != '[removed]')\
Example 21
dfSpark.select(['id', 'tokens', 'refined_tokens']).show(10)

# COMMAND ----------

dfSpark.filter(((dfSpark.publication == 'New York Times') |
                (dfSpark.publication == 'Vox')))

# COMMAND ----------

display(dfSpark.groupBy('publication').count())
#display(dfSpark.groupBy('year').count())

# COMMAND ----------

sparknlp.start()
pipeline = PretrainedPipeline('analyze_sentiment', 'en')

# COMMAND ----------

dfSpark = dfSpark.withColumn('Sentimiento', when(rand() > 0.5, 1).otherwise(0))

# COMMAND ----------

dfSpark = dfSpark.withColumn(
    "label", dfSpark.Sentimiento.cast('float')).drop('Sentimiento')

# COMMAND ----------

dfSpark.orderBy(rand()).show(10)

# COMMAND ----------
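
# Hedged follow-up: with the float 'label' column in place, a typical next step is a
# train/test split before fitting a classifier; the split ratios below are placeholders.
train_df, test_df = dfSpark.randomSplit([0.8, 0.2], seed=42)
print(train_df.count(), test_df.count())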