Example #1
    def __init__(
        self,
        stopwords_folder_path: AnyStr = None,
        use_models: bool = False,
        hashtags_as_token: bool = True,
        batch_size: int = DEFAULT_BATCH_SIZE,
        max_num_characters: int = MAX_NUM_CHARACTERS,
    ):
        """Initialization method for the MultilingualTokenizer class, with optional arguments

        Args:
            stopwords_folder_path (str, optional): Path to a folder with stopword text files (one line per stopword)
                Files should be named "{language_code}.txt" with the code in ISO 639-1 format
            use_models (bool): If True, loads spaCy models, which is slower but allows retrieving
                Part-of-Speech and Entity tags for downstream tasks. Default is False.
            hashtags_as_token (bool): Treat hashtags as one token instead of two
                Default is True, which overrides the spaCy default behavior
            batch_size (int): Number of documents to process in spaCy pipelines
                Default is set by the DEFAULT_BATCH_SIZE class constant
            max_num_characters (int): Maximum number of characters in a single text
                Default is 10 million, higher than spaCy's more conservative default of 1 million

        """
        store_attr()
        self.spacy_nlp_dict = {}
        self.tokenized_column = None  # may be changed by tokenize_df
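
All of the constructors on this page delegate attribute assignment to store_attr. Assuming it is fastcore's store_attr (fastcore.basics.store_attr, the helper used throughout fastai code), here is a minimal sketch of what the call replaces; the Demo class and its parameters are made up for illustration:

from fastcore.basics import store_attr

class Demo:
    def __init__(self, stopwords_folder_path=None, use_models=False, batch_size=1000):
        # With no arguments, store_attr copies every parameter of the
        # enclosing __init__ onto self under the same name, i.e. it is
        # shorthand for:
        #   self.stopwords_folder_path = stopwords_folder_path
        #   self.use_models = use_models
        #   self.batch_size = batch_size
        store_attr()
        # A comma-separated string stores only the named parameters,
        # as in Example #8 below: store_attr('use_models,batch_size')

d = Demo(use_models=True)
print(d.use_models, d.batch_size)  # True 1000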
Example #2
    def __init__(self, emailer, messenger, items, confirms=1):
        """Base class for scraping.

        Args:
            emailer (Emailer): emailer to use for alerts
            messenger (Messenger): messenger to use for alerts
            items (list): list of item descriptions
            confirms (int): number of repeating states for a state change
        """

        super().__init__()
        store_attr()

        self.id = str(uuid4())[-12:]
        self.options = Options()
        self.options.headless = True
        self.options.add_argument("start-maximized")
        # self.profile = FirefoxProfile()
        # self.profile.set_preference("dom.disable_beforeunload", True)
        # self.profile.set_preference("browser.tabs.warnOnClose", False)
        self.driver = None
        self.waiter = None

        self.stock_state = {
            x["name"]: {
                "current_state": None,
                "pending_state": [None for _ in range(self.confirms)],
                "excluded": x["exclude"]
            }
            for x in self.items
        }
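
For illustration, a hypothetical items list and the stock_state structure the comprehension above builds from it; only the "name" and "exclude" keys are assumed, since those are the only ones the code reads:

items = [
    {"name": "console", "exclude": False},
    {"name": "graphics-card", "exclude": True},
]
confirms = 2
stock_state = {
    x["name"]: {
        "current_state": None,
        "pending_state": [None for _ in range(confirms)],
        "excluded": x["exclude"],
    }
    for x in items
}
# stock_state == {
#     "console": {"current_state": None, "pending_state": [None, None], "excluded": False},
#     "graphics-card": {"current_state": None, "pending_state": [None, None], "excluded": True},
# }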
Example #3
    def __init__(
        self,
        tokenizer: MultilingualTokenizer,
        token_filters: Set[AnyStr],
        lemmatization: bool = True,
        lowercase: bool = True,
        unicode_normalization: UnicodeNormalization = UnicodeNormalization.NONE,
        keep_filtered_tokens: bool = False,
    ):
        """Initialization method for the TextCleaner class, with optional arguments

        Args:
            tokenizer (MultilingualTokenizer): Tokenizer instance to handle the initial tokenization step
            token_filters (set): Set of spaCy token attributes to filter out
                Available token filters are defined in MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES
            lemmatization (bool, optional): If True, lemmatize tokens using spaCy lookups data
                Default is True, which simplifies all tokens to their lemma e.g. going -> go, mice -> mouse
            lowercase (bool, optional): If True, convert everything to lowercase after filter and lemmatization steps
                Default is True
            unicode_normalization (UnicodeNormalization, optional): Unicode normalization method (final post-processing)
                Default is not to apply normalization. Beware that it's a more complex topic than it looks.
                Read https://en.wikipedia.org/wiki/Unicode_equivalence if you want to understand more
                TL;DR: human languages are a mess => Unicode is a mess too
            keep_filtered_tokens (bool): If True, store filtered tokens in additional columns in the output dataframe
                Default is False, adding only 1 column, which is the cleaned version of the original text

        """
        store_attr()
        self.output_column_descriptions = (
            self.OUTPUT_COLUMN_DESCRIPTIONS.copy()
        )  # will be changed by `_prepare_df_for_cleaning`
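
A hedged instantiation sketch: the classes are assumed to be importable from the surrounding plugin code, and the filter names are ordinary spaCy token attributes (is_punct, is_stop); the authoritative list is the MultilingualTokenizer.DEFAULT_FILTER_TOKEN_ATTRIBUTES constant mentioned in the docstring:

# Illustrative values only; check DEFAULT_FILTER_TOKEN_ATTRIBUTES for the
# token filters actually supported by the plugin.
cleaner = TextCleaner(
    tokenizer=MultilingualTokenizer(),
    token_filters={"is_punct", "is_stop"},   # drop punctuation and stopwords
    lemmatization=True,                      # going -> go, mice -> mouse
    lowercase=True,
    keep_filtered_tokens=False,              # only add the cleaned-text column
)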
Example #4
    def __init__(
        self,
        tokenizer: MultilingualTokenizer,
        text_column: AnyStr,
        font_folder_path: AnyStr,
        language: AnyStr = "en",
        language_column: AnyStr = None,
        subchart_column: AnyStr = None,
        max_words: int = DEFAULT_MAX_WORDS,
        color_list: List = DEFAULT_COLOR_LIST,
        font: str = DEFAULT_FONT,
        scale: float = DEFAULT_SCALE,
        margin: float = DEFAULT_MARGIN,
        random_state: int = DEFAULT_RANDOM_STATE,
        figsize: tuple = DEFAULT_FIGSIZE,
        dpi: int = DEFAULT_DPI,
        titlepad: int = DEFAULT_TITLEPAD,
        titlesize: int = DEFAULT_TITLESIZE,
        pad_inches: int = DEFAULT_PAD_INCHES,
        bbox_inches: str = DEFAULT_BBOX_INCHES,
        background_color: str = DEFAULT_BACKGROUND_COLOR,
    ):
        """Initialization method for the WordcloudVisualizer class, with optional arguments etailed above"""

        store_attr()
        random.seed(self.random_state)
        self.language_as_subchart = self.language_column == self.subchart_column
        if self.subchart_column == "order66":
            self.font = "DeathStar.otf"
            self.subchart_column = None
Example #5
 def __init__(
     self,
     ontology_df: pd.DataFrame,
     tag_column: AnyStr,
     category_column: AnyStr,
     keyword_column: AnyStr,
     language: AnyStr,
     lemmatization: bool = False,
     ignore_case: bool = False,
     ignore_diacritics: bool = False,
 ):
     store_attr()
     self._remove_incomplete_rows()
     if self.category_column:
         self._replace_missing_categories()
     # set the punctuation characters to use for sentence splitting
     config = {
         "sentencizer": {
             "punct_chars": Sentencizer.default_punct_chars + ["\n"]
         }
     }
     self.tokenizer = MultilingualTokenizer(
         add_pipe_components=["sentencizer"],
         enable_pipe_components="sentencizer",
         config=config,
     )
     self._matcher_dict = {}
     # Dictionary of spaCy PhraseMatcher objects filled by the _match_no_category method.
     # Unused if we are using EntityRuler (in case there are categories in the Ontology)
     self.column_descriptions = {}
     # Dictionary of new columns to add in the dataframe (key) and their descriptions (value).
     # It is filled by the _format_with_category / _format_no_category methods
     self._use_nfc = self.lemmatization and not self.ignore_diacritics
     # Text will be normalized with NFC if True, with NFD otherwise.
     self._keyword_to_tag = {}
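
The config dict above only extends spaCy's Sentencizer so that a newline also ends a sentence. A standalone sketch of the same idea against the plain spaCy v3 API (no plugin code involved):

import spacy
from spacy.pipeline import Sentencizer

nlp = spacy.blank("en")
# Split on the default sentence-ending punctuation plus "\n",
# mirroring the "sentencizer" config built above.
nlp.add_pipe("sentencizer", config={"punct_chars": Sentencizer.default_punct_chars + ["\n"]})

doc = nlp("First line\nSecond line. Third sentence.")
print([sent.text for sent in doc.sents])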
Example #6
    def __init__(self,
                 items,
                 tfms,
                 use_list=None,
                 do_setup=True,
                 split_idx=None,
                 train_setup=True,
                 splits=None,
                 types=None,
                 verbose=False,
                 dl_type=None):
        super().__init__(items, use_list=use_list)
        if dl_type is not None: self._dl_type = dl_type

        #potentially unused
        self.splits = L([slice(None), []] if splits is None else splits).map(
            mask2idxs)
        if isinstance(tfms, TfmdListsX): tfms = tfms.tfms
        if isinstance(tfms, PipelineX): do_setup = False

        # This is relevant, equivalent to PipelineX
        self.tfms = PipelineX(tfms, split_idx=split_idx)

        store_attr('types,split_idx')
        if do_setup:
            pv(f"Setting up {self.tfms}", verbose)
            self.setup(train_setup=train_setup)
Example #7
 def __init__(
     self,
     api_wrapper: GoogleCloudVisionAPIWrapper,
     input_folder: dataiku.Folder,
     input_df: pd.DataFrame,
     column_prefix: AnyStr = "api",
     input_folder_is_gcs: bool = False,
     input_folder_bucket: AnyStr = "",
     input_folder_root_path: AnyStr = "",
     output_dataset: dataiku.Dataset = None,
     output_folder: dataiku.Folder = None,
     output_folder_is_gcs: bool = False,
     output_folder_bucket: AnyStr = "",
     output_folder_root_path: AnyStr = "",
     api_quota_rate_limit: int = 1800,
     api_quota_period: int = 60,
     batch_support: bool = False,
     batch_size: int = 4,
     parallel_workers: int = 4,
     error_handling: ErrorHandling = ErrorHandling.LOG,
     features: List[Dict] = [{}],
     max_results: int = 10,
     image_context: Dict = {},
     minimum_score: float = 0.0,
     content_categories: List[vision.Feature.Type] = [],
     unsafe_content_categories: List[UnsafeContentCategory] = [],
     **kwargs,
 ):
     store_attr()
Example #8
 def __init__(self,
              vocab_sz,
              emb_sz,
              n_hid,
              n_layers,
              pad_token=1,
              hidden_p=0.2,
              input_p=0.6,
              embed_p=0.1,
              weight_p=0.5,
              bidir=False):
     store_attr('emb_sz,n_hid,n_layers,pad_token')
     self.bs = 1
     self.n_dir = 2 if bidir else 1
     self.encoder = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
     self.encoder_dp = EmbeddingDropout(self.encoder, embed_p)
     self.rnns = nn.ModuleList([
         self._one_rnn(emb_sz if l == 0 else n_hid,
                       (n_hid if l != n_layers - 1 else emb_sz) //
                       self.n_dir, bidir, weight_p, l)
         for l in range(n_layers)
     ])
     self.encoder.weight.data.uniform_(-self.initrange, self.initrange)
     self.input_dp = RNNDropout(input_p)
     self.hidden_dps = nn.ModuleList(
         [RNNDropout(hidden_p) for l in range(n_layers)])
     self.reset()
Example #9
    def __init__(self, max_retries=3):
        """Determine email timing values.

        Args:
            max_retries (int): maximum action attempts before terminating
        """

        store_attr()
Example #10
 def __init__(
     self,
     input_table: str,
     mv_output: str,
     taxa_output: str,
     core_model: str = "en_core_sci_sm",
 ) -> None:
     store_attr("input_table, mv_output, taxa_output, core_model")
Example #11
 def __init__(self,
              email: str,
              archive_paths: str,
              extracted_output: str,
              query: str = 'mud[TIAB] AND volcano[TIAB]') -> None:
     store_attr('email, query')
     self.archive_paths = Path(archive_paths).glob('**/*.gz')
     self.extracted_output = Path(extracted_output)
Example #12
    def __init__(self,
                 in_channels=1,
                 n_classes=2,
                 stride=1,
                 inplanes=64,
                 pre_ssl=True,
                 **kwargs):
        super().__init__()
        store_attr('in_channels, n_classes, inplanes, pre_ssl')
        #encoder
        if pre_ssl:
            m = torch.hub.load(
                'facebookresearch/semi-supervised-ImageNet1K-models',
                'resnext50_32x4d_ssl')
        else:
            m = ResNet(Bottleneck, [3, 4, 6, 3], groups=32, width_per_group=4)
        m.conv1.padding = (0, 0)

        if in_channels < 3:
            #print('Cutting input layer weights to', in_channels, 'channel(s).')
            with torch.no_grad():
                m.conv1.weight = nn.Parameter(m.conv1.weight[:, :in_channels,
                                                             ...])
        elif in_channels > 3:
            m.conv1 = nn.Conv2d(in_channels,
                                self.inplanes,
                                kernel_size=7,
                                stride=2,
                                bias=False)

        #self.bn1 =  m.bn1 if in_channels==3 else nn.BatchNorm2d(self.inplanes)
        self.enc0 = nn.Sequential(m.conv1, m.bn1, nn.ReLU(inplace=True))
        self.enc1 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1),
            m.layer1)  #256
        self.enc2 = m.layer2  #512
        self.enc3 = m.layer3  #1024
        self.enc4 = m.layer4  #2048
        #aspp with customized dilatations
        self.aspp = ASPP(
            2048,
            256,
            out_c=512,
            dilations=[stride * 1, stride * 2, stride * 3, stride * 4])
        self.drop_aspp = nn.Dropout2d(0.5)
        #decoder
        self.dec4 = UnetBlock(512, 1024, 256, padding=0)
        self.dec3 = UnetBlock(256, 512, 128, padding=0)
        self.dec2 = UnetBlock(128, 256, 64, padding=0)
        self.dec1 = UnetBlock(64, 64, 32, padding=0)
        self.fpn = FPN([512, 256, 128, 64], [16] * 4)
        self.drop = nn.Dropout2d(0.1)
        self.final_conv = ConvLayer(32 + 16 * 4,
                                    n_classes,
                                    ks=1,
                                    norm_type=None,
                                    act_cls=None)
Example #13
 def __init__(
     self,
     text_df,
     text_column,
     tokenizer,
     language,
     language_column=None,
 ):
     store_attr()
Example #14
 def __init__(self, path = Path('.'), data_fn='data', model_fn='model', data_func=None, bs=16, cpu=False, onnx=False):
     data = load_data(path, data_fn)
     self.n_inp = data['n_inp']
     self.pipelines = make_pipelines(data)
     self.after_item = self.pipelines['after_item']
     self.after_batch = self.pipelines['after_batch']
     self.tfm_y = generate_pipeline(data['tfms'], order=False)
     self.model = load_model(path, model_fn, cpu, onnx)
     self.device = 'cpu' if cpu else 'cuda'
     store_attr(self, 'data_func,bs')
     self.decode_func = None
Example #15
 def __init__(
     self,
     gcp_service_account_key: AnyStr = None,
     gcp_continent: AnyStr = None,
     api_quota_period: int = 60,
     api_quota_rate_limit: int = 1800,
 ):
     store_attr()
     self.client = self.get_client()
     self.call_api_annotate_image = self._build_call_api_annotate_image()
     self.call_api_document_text_detection = self._build_call_api_document_text_detection()
Example #16
    def __init__(
        self,
        stopwords_folder_path: Optional[AnyStr] = None,
        use_models: bool = False,
        hashtags_as_token: bool = True,
        batch_size: int = DEFAULT_BATCH_SIZE,
        max_num_characters: int = MAX_NUM_CHARACTERS,
        add_pipe_components: List[str] = [],
        enable_pipe_components: Optional[Union[List[str], str]] = None,
        disable_pipe_components: Optional[Union[List[str], str]] = None,
        config: dict = {},
    ):
        """Initialization method for the MultilingualTokenizer class, with optional arguments

        Args:
            stopwords_folder_path (str, optional): Path to a folder with stopword text files (one line per stopword).
                Files should be named "{language_code}.txt" with the code in ISO 639-1 format.
            use_models (bool): If True, loads spaCy models, which is slower but allows retrieving
                Part-of-Speech and Entity tags for downstream tasks. Default is False.
            hashtags_as_token (bool): Treat hashtags as one token instead of two.
                Default is True, which overrides the spaCy default behavior.
            batch_size (int): Number of documents to process in spaCy pipelines.
                Default is set by the DEFAULT_BATCH_SIZE class constant.
            max_num_characters (int): Maximum number of characters in a single text.
                Default is 10 million, higher than spaCy's more conservative default of 1 million.
            add_pipe_components (list): List of spaCy pipeline components to add, for instance "sentencizer".
                If use_models is False, only the tokenizer component is present so other components must be added explicitly.
                If use_models is True, several pipeline components are automatically added.
                Please refer to the spaCy documentation to know which components are available for each model.
            enable_pipe_components (list, optional): List of spaCy pipeline components to enable.
                To enable components, they must be added first, either by activating use_models
                or by adding them explicitly in add_pipe_components.
            disable_pipe_components (list, optional): List of spaCy pipeline components to disable.
                To disable components, they must be added first, either by activating use_models
                or by adding them explicitly in add_pipe_components.
                Please use either enable_pipe_components or disable_pipe_components, as both cannot be used at the same time.
            config (dict): Dictionary mapping a spaCy component name (key) to its associated spacy.Language config dictionary (value).
                This config dictionary contains metadata about the component.
                If empty, the spaCy default config is used, describing the default values of the factory arguments.

        """
        store_attr()
        self.spacy_nlp_dict = {}
        self.tokenized_column = None  # may be changed by tokenize_df
        self._restore_pipe_components = {}
        """spacy.language.DisabledPipes object initialized in create_spacy_tokenizer()
        Contains the components of each SpaCy.Language object that have been disabled by spacy.Languages.select_pipes() method.
        Those components can be re-added to each SpaCy.Language at their initial place in the pipeline, by calling restore_pipe_components[language].restore()
        
        """
        if self.enable_pipe_components and self.disable_pipe_components:
            raise ValueError(
                "Only one of enable_pipe_components and disable_pipe_components can be specified at once."
            )
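
A hedged usage sketch of the extended signature, wiring in a sentencizer the same way Example #5 does and showing the mutually exclusive enable/disable check (the class itself is assumed importable):

tokenizer = MultilingualTokenizer(
    add_pipe_components=["sentencizer"],
    enable_pipe_components="sentencizer",
)

# Supplying both the enable and disable lists is rejected at construction time:
try:
    MultilingualTokenizer(
        add_pipe_components=["sentencizer"],
        enable_pipe_components=["sentencizer"],
        disable_pipe_components=["sentencizer"],
    )
except ValueError as error:
    print(error)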
Example #17
 def __init__(self,
              tok,
              rules=None,
              counter=None,
              lengths=None,
              mode=None,
              sep=' '):
     if isinstance(tok, type): tok = tok()
     store_attr('tok,counter,lengths,mode,sep')
     self.rules = defaults.text_proc_rules if rules is None else rules
     print(self.rules)
     print(tok)
Example #18
    def __init__(self, sender, account_id, auth_token):
        """Sends SMS.

        Args:
            sender (str): sender number
            account_id (str): twilio account id
            auth_token (str): twilio auth token
        """

        store_attr()

        self.client = Client(account_id, auth_token)
Example #19
    def __init__(self, emailer_configs, messenger_configs, database):
        """Factory class for creating specific scrapers.

        Args:
            emailer_configs (dict): configs for email sender
            messenger_configs (dict): configs for sms sender
            database (Database): database of items and subscribers
        """

        store_attr()

        self.scrapers_classes = [x for x in Scraper.__subclasses__()]
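
The factory relies on Python's built-in __subclasses__ hook to discover concrete scrapers; a self-contained sketch of that mechanism with made-up class names:

class Scraper:
    """Stand-in for the base class above."""

class AmazonScraper(Scraper):
    pass

class BestBuyScraper(Scraper):
    pass

# Every subclass that has been defined (i.e. its module imported) shows up:
print([cls.__name__ for cls in Scraper.__subclasses__()])
# ['AmazonScraper', 'BestBuyScraper']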
Example #20
    def __init__(self, server, port, sender, sender_pass, recipient=None):
        """Sends emails.

        Args:
            server (str): email server
            port (str): email server port
            sender (str): sender email address
            sender_pass (str): sender email password
            recipient (list): default email recipients
        """

        super().__init__()
        store_attr()
Example #21
    def __init__(self, items_db_file, subs_db_file):
        """Access databases for items and subscriptions.

        Args:
            items_db_file (str): file path for database of items
            subs_db_file (str): file path for database of subscribers
        """

        store_attr()

        with open(file=self.items_db_file, mode="r") as f:
            self.items_db = load(fp=f)
        with open(file=self.subs_db_file, mode="r") as f:
            self.subs_db = load(fp=f)
Example #22
    def __init__(self,
                 site_load_time=5,
                 poll_time=2,
                 max_refreshes=10,
                 max_wait_time=5):
        """Determine scrape timing values.

        Args:
            site_load_time (int): wait time for a site to load before scraping
            poll_time (int): wait time between scraping a site
            max_refreshes (int): maximum site refreshes before reconnecting
            max_wait_time (int): maximum wait time for a site element to be found during scraping
        """

        store_attr()
Example #23
 def __init__(
     self,
     language_scope: List = SUPPORTED_LANGUAGES_PYCLD3.keys(),
     minimum_score: float = 0.0,
     fallback_language: AnyStr = "",
 ):
     store_attr()
      self.column_descriptions = self.COLUMN_DESCRIPTIONS.copy()  # may be changed by detect_languages_df
     self._langid_identifier = LanguageIdentifier.from_modelstring(
         model, norm_probs=True)
     self._langid_identifier.set_languages([
         l for l in self.language_scope
         if l not in SUPPORTED_LANGUAGES_PYCLD3_NOT_LANGID
     ])
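
LanguageIdentifier.from_modelstring and set_languages come from the langid package; a minimal sketch of restricting the detector to a language scope, assuming langid is installed:

from langid.langid import LanguageIdentifier, model

identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
identifier.set_languages(["en", "fr", "es"])  # restrict the scope, as above
language_code, probability = identifier.classify("Bonjour tout le monde")
print(language_code, probability)  # most likely 'fr' with a probability close to 1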
Example #24
 def __init__(
     self,
     input_df: pd.DataFrame,
     input_folder: dataiku.Folder = None,
     column_prefix: AnyStr = "api",
     error_handling: ErrorHandling = ErrorHandling.LOG,
     parallel_workers: int = DEFAULT_PARALLEL_WORKERS,
     **kwargs,
 ):
     store_attr()
     self.output_df = None  # initialization before calling format_df
     self.api_column_names = build_unique_column_names(
         input_df.keys(), column_prefix)
     self.column_description_dict = {
         column_name: API_COLUMN_NAMES_DESCRIPTION_DICT[key]
         for key, column_name in self.api_column_names._asdict().items()
     }
     self.column_description_dict[
         PATH_COLUMN] = "Path of the file relative to the input folder"
Example #25
    def __init__(
        self,
        tokenizer: MultilingualTokenizer,
        dictionary_folder_path: AnyStr,
        custom_vocabulary_set: Set[AnyStr] = set(),
        custom_corrections: Dict = {},
        edit_distance: int = DEFAULT_EDIT_DISTANCE,
        ignore_token: Pattern = None,
        transfer_casing: bool = True,
        compute_diagnosis: bool = True,
    ):
        """Initialization method for the SpellChecker class, with optional arguments

        Args:
            dictionary_folder_path: Local path to a folder containing SymSpell dictionary files
                Each dictionary file in the folder should be named "xx.txt"
                where xx is the language code in ISO 639-1 format
            custom_vocabulary_set: Optional - Set of words that should not be corrected
            custom_corrections: Optional - Dictionary of words (key) and their custom correction (value)
            edit_distance: Maximum edit distance between a word and its correction.
                Default is 2, which is the SymSpell recommendation for reasonable speed and quality
            ignore_token: Regular expression for words not to be corrected
                Should be a compiled regex object, use re.compile beforehand
            transfer_casing (bool): If True, transfer input word case to the corrected word
                Default is True, which works well for European languages
            compute_diagnosis (bool): If True, compute spellchecker diagnosis of each word
                Adds ~20% processing time but helps understand what the spellchecker did

        """
        store_attr()
        self._symspell_checker_dict = {}
        self.output_column_descriptions = (
            self.OUTPUT_COLUMN_DESCRIPTIONS.copy()
        )  # may be changed by `_prepare_df_for_spellchecker`
        if self.compute_diagnosis:
            self._diagnosis_lock = Lock()
            self._token_dict = {
                k: Counter()
                for k in SUPPORTED_LANGUAGES_SYMSPELL
            }  # may be changed by check_token
            self._diagnosis_list = []  # may be changed by check_token
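
The ignore_token argument expects a pre-compiled regex; a hedged sketch of constructing the checker with one (the pattern, folder name and custom entries below are illustrative assumptions, not the plugin's defaults):

import re

# Hypothetical pattern: leave hashtags, @mentions and URLs untouched.
ignore_token = re.compile(r"(?:[#@]|https?://)\S+")

spellchecker = SpellChecker(
    tokenizer=MultilingualTokenizer(),
    dictionary_folder_path="symspell_dictionaries",  # folder containing e.g. en.txt
    custom_vocabulary_set={"spaCy", "SymSpell"},     # never corrected
    custom_corrections={"teh": "the"},               # forced corrections
    ignore_token=ignore_token,
    transfer_casing=True,
)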
Example #26
 def __init__(
     self,
     language: AnyStr,
     tokenizer: MultilingualTokenizer,
     category_column: AnyStr,
     ignore_case: bool,
     lemmatization: bool,
     ignore_diacritics: bool,
     text_column_tokenized: AnyStr,
     _use_nfc: bool,
     tag_columns: List[AnyStr],
     _keyword_to_tag: dict = None,
     _matcher_dict: dict = None,
 ):
     store_attr()
     self.output_df = (
         pd.DataFrame()
     )  # pandas.DataFrame with new columns concerning the found tags
     tqdm.pandas(miniters=1, mininterval=5.0)
     self.column_descriptions = {}
     """Dictionary of new columns to add in the dataframe (key) and their descriptions (value)
Example #27
 def __init__(
     self,
     model_name: str,
     batch_size: int = 8,
     attention_probs_dropout_prob: float = 0.4,
     learning_rate: float = 5e-7,
     adam_epsilon: float = 1e-8,
     hidden_dropout_prob: float = 0.3,
     epochs: int = 3,
     lm_model_dir: str = None,
     wname=None,
     drivepath="../drive/My\ Drive/HinglishNLP/repro",
 ):
     store_attr()
     self.timestamp = str(datetime.now().strftime("%d.%m.%y"))
     if not self.wname:
         self.wname = self.model_name
     wandb.init(
         project="hinglish",
         config={
             "model_name": self.model_name,
             "batch_size": self.batch_size,
             "attention_probs_dropout_prob":
             self.attention_probs_dropout_prob,
             "learning_rate": self.learning_rate,
             "adam_epsilon": self.adam_epsilon,
             "hidden_dropout_prob": self.hidden_dropout_prob,
             "epochs": self.epochs,
         },
         name=f"{self.wname} {self.timestamp}",
     )
     print({"Model Info": f"Setup self.model training for {model_name}"})
     self.device = check_for_gpu(self.model_name)
     if not lm_model_dir:
         if self.model_name == "bert":
             self.lm_model_dir = "model_save"
         elif self.model_name == "distilbert":
             self.lm_model_dir = "distilBert6"
         elif self.model_name == "roberta":
             self.lm_model_dir = "roberta3"
Example #28
 def __init__(
     self, content_categories: List[vision.Feature.Type], minimum_score: float = 0, max_results: int = 10, **kwargs,
 ):
     store_attr()
     self._compute_column_description()
Example #29
 def __init__(self, data: Dict[str, Any], timeline: Dict[str, Any]):
     store_attr()
Example #30
 def __init__(self, vocab=None, sort=True, add_na=False):
     if vocab is not None:
         vocab = CategoryMap(vocab, sort=sort, add_na=add_na)
     store_attr()