def make_graph(self, instr_file):
    """Creates a graph from the given instruction file."""
    err_msg = "couldn't read instructions"
    self.uniquified = False
    with utils.ringo_open(instr_file, err_msg) as f:
        # Execute query
        try:
            for line in f:
                tokens = list(utils.get_tokens(line))
                {
                    'SRC': self.set_src,
                    'DST': self.set_dst,
                    'EDGE_ATTR': self.set_edge_attr,
                    'FLAGS': self.set_flags,
                    'LOAD': self.load,
                    'START': self.start,
                    'LABEL': self.label,
                    'JOIN': self.join,
                    'SELECT': self.select,
                    'COUNT': self.count,
                    'GROUP': self.group,
                    'ORDER': self.order
                }[tokens[0]](*tokens[1:])
        except KeyError:
            raise InvalidInstructionException('Incomplete query')
    self.build_graph()
    return
def feature_value(self, name):
    # TODO: can probably use the local attrs dictionary for many of these
    if name == 'eventStatus':
        return '1'
    elif name == 'nodeType':
        return self.__class__.__name__
    elif name in (EVENTID, EIID, CLASS, TENSE, ASPECT, EPOS, STEM):
        return self.tree.events[self.eid][name]
    elif name == MOD:
        return self._get_attribute(name, 'NONE')
    elif name == POL:
        return self._get_attribute(name, 'POS')
    elif name in ('text', FORM):
        if self.eid in self.tree.events:
            return self.tree.events[self.eid][FORM]
        else:
            logger.warning(
                "Event %s is not stored in the events on the TarsqiTree" % self)
            return ' '.join([t.text for t in get_tokens(self)])
    elif name == POS:
        try:
            return self.tree.events[self.eid][POS]
        except KeyError:
            # I don't remember whether POS has a particular use here
            # or is a left over from prior times
            logger.warning("Returning 'epos' instead of 'pos' value")
            return self.tree.events[self.eid][EPOS]
    else:
        raise AttributeError(name)
def parse_text(tw_obj):
    # remove user mentions and urls from the text
    # use the extended tweet if present
    if 'extended_tweet' in tw_obj:
        text = tw_obj['extended_tweet']['full_text']
    # or use the normal text
    else:
        text = tw_obj['text']
    # process the quoted tweet and append it to the text
    if tw_obj['is_quote_status'] and 'quoted_status' in tw_obj:
        # process quoted tweet
        qt_obj = tw_obj['quoted_status']
        if 'extended_tweet' in qt_obj:
            qt_text = qt_obj['extended_tweet']['full_text']
        # or use the normal text
        else:
            qt_text = qt_obj['text']
        text = ''.join([text, ' %QUOTES% ', qt_text])
    text_norm = normalizeTextForTagger(replace_sp_tokens(text))
    # process text into a list of keywords
    text_tokens = get_tokens(text)
    text_tokens = [t for t in text_tokens if t not in stopwords]
    token_counts = dict(Counter(text_tokens))
    # text_tokens = [lemma(t) for t in text_tokens]
    return text, text_norm, text_tokens, token_counts
def preprocess(doc, query):
    if query:
        tokens = utils.get_tokens(doc)
        doc = utils.removeStopWords(tokens)
        lemmaWords = utils.lemmatizer(doc)
        cleanWords = utils.cleanText(lemmaWords)
        return cleanWords
def auto_complete_places(place, region):
    # Gather the variables and build the request URL
    # pt_objects refers to public transport objects
    looking_for = 'pt_objects?'
    # q= is the parameter used to request autocompletion
    query = 'q=' + ''.join(place.split())
    # type[]=stop_area restricts the results to public transport stops
    object_type = 'type[]=stop_area'
    # Concatenate the final URL from the ROOT_URL defined above and the region passed as a parameter
    url_final = ROOT_URL + region + '/' + looking_for + query + '&' + object_type
    print("Requesting @ " + url_final)
    # Query the API with the requests library, adding the token dynamically
    data = requests.get(url=url_final, auth=(get_tokens('navitia'), ''))
    dict_results = {}
    if 'pt_objects' not in data.json():
        return dict_results
    # Store the result array in data
    data = data.json()['pt_objects']
    # Loop over the results and store, as key:value pairs, the commercial name of each stop and its unique ID
    for result in data:
        dict_results.update(
            {result['stop_area']['name']: result['stop_area']['id']})
    return dict_results
def main():
    # Create the updater, passing it the Telegram API token
    updater = Updater(get_tokens('telegram'))
    # Keep a reference to the dispatcher so handlers can be added to it
    dispatcher = updater.dispatcher
    # Add the conversation handler with the states DEPARTURE, AUTOCOMPLETE_DEP,
    # DESTINATION, AUTOCOMPLETE_DEST and DATETIME
    conv_handler = ConversationHandler(
        entry_points=[CommandHandler('r', recherche)],
        states={
            DEPARTURE: [
                MessageHandler(Filters.text & ~Filters.command, auto_complete_dep),
            ],
            AUTOCOMPLETE_DEP: [CallbackQueryHandler(destination)],
            DESTINATION: [
                MessageHandler(Filters.text & ~Filters.command, auto_complete_dest)
            ],
            AUTOCOMPLETE_DEST: [CallbackQueryHandler(datetime)],
            DATETIME: [MessageHandler(Filters.text & ~Filters.command, result)]
        },
        fallbacks=[CommandHandler('cancel', cancel)])
    # Add the conversation handler to the dispatcher (all other handlers are added implicitly)
    dispatcher.add_handler(conv_handler)
    # Start the bot
    updater.start_polling()
    updater.idle()
def get_text_language(text):
    language_rank = {}
    tokens = utils.get_tokens(text)
    for language in language_helper.get_languages():
        c_stopwords = language_helper.get_language_stopwords(language)
        # Score each language by how many of its stopwords occur in the text
        language_rank[language] = sum(1 for token in tokens if token in c_stopwords)
    # Return the language with the highest stopword count
    sorted_languages = sorted(language_rank.items(), key=lambda x: -x[1])
    return sorted_languages[0][0]
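# Hypothetical usage sketch for get_text_language above (not from the source);
# it assumes utils.get_tokens performs simple word tokenization and that
# language_helper includes an English stopword list.
sample = "this is a short sentence and it is written in english"
best_language = get_text_language(sample)
print(best_language)  # the language whose stopwords occur most often in the text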
def explain(self, text, nwords, return_weights=False):
    '''
    Use `LimeTextExplainer` to obtain the top `nwords` most important/polar
    words in the `text` as an explanation.

    Parameters
    --------------
    text: str
        The text to explain.
    nwords: int
        The number of most important words to return (i.e. explanation size).
    return_weights: bool
        Set to True to also return the weights assigned by LIME.

    Returns
    ---------------
    word_ranking: list
        Indexes of the `nwords` top-ranked words in the text.
    ranked_words: list
        List of `nwords` top-ranked words in the text.
    weights: dict, optional
        The dictionary of weights (word position -> weight) assigned by LIME
        to the words in the text.
    explanation: optional
        The explanation object returned by `LimeTextExplainer`.
    '''
    text = preprocess_text(text)
    text_words = get_tokens(text)
    class_names = ['negative', 'positive']
    # bow is set to False because word order is important
    explainer = LimeTextExplainer(class_names=class_names,
                                  feature_selection='auto',
                                  bow=False,
                                  split_expression=' ',
                                  verbose=False)
    explanation = explainer.explain_instance(text_instance=text,
                                             labels=[0, 1],
                                             classifier_fn=self.predict_texts,
                                             num_features=nwords,
                                             num_samples=self.nsamples)
    # sort weights by decreasing absolute value
    weights = OrderedDict(
        sorted(explanation.as_map()[1], key=lambda weight: -abs(weight[1])))
    word_ranking = np.array(list(weights.keys()))
    ranked_words = [text_words[i] for i in word_ranking]
    if return_weights:
        return word_ranking, ranked_words, weights, explanation
    return word_ranking, ranked_words
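# Hypothetical usage sketch for the explain method above (not from the source);
# `lime_explainer` stands in for an instance of the surrounding class, which must
# provide the predict_texts method and nsamples attribute the method relies on.
word_ranking, ranked_words = lime_explainer.explain(
    "the plot was dull but the acting was superb", nwords=3)
print(ranked_words)  # the three words LIME weights most heavily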
def index_file(inv_idx, file_buf, file_name):
    tokens = None
    if file_name.endswith('.pdf'):
        text = '\n'.join(pdftotext.PDF(file_buf))
        tokens = get_tokens(text, False)
    elif file_name.endswith('.txt'):
        text = file_buf.read().decode()
        tokens = get_tokens(text, True)
    if tokens is not None:
        if file_name.endswith('.pdf'):
            # PDFs are indexed as a single document
            id = file_name
            for text, para in tokens:
                index_words(inv_idx, para, id, text)
        elif file_name.endswith('.txt'):
            # Text files are indexed paragraph by paragraph
            for i, (text, para) in enumerate(tokens, 0):
                id = f'{file_name}_para_{i}'
                index_words(inv_idx, para, id, text)
def get_adv_text(orig_text, used_replacements):
    '''
    Apply replacements to text to obtain adversarial text.
    '''
    text_words = get_tokens(orig_text)
    for (pos, word, replacement_word) in used_replacements:
        assert text_words[pos] == word, \
            'pos = %d, text_word = %s , word = %s' % (pos, text_words[pos], word)
        text_words[pos] = replacement_word
    return ' '.join(text_words)
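# Hypothetical usage sketch for get_adv_text above (not from the source);
# it assumes get_tokens splits on whitespace, so positions index whole words.
replacements = [(3, 'great', 'dull')]  # (position, original word, replacement word)
adv = Attacker.get_adv_text('the movie was great', replacements)
print(adv)  # 'the movie was dull'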
def __init__(self):
    self.twitterTokens = get_tokens()
    http_proxy, https_proxy = get_proxy()
    self.api = twitter.Api(consumer_key=self.twitterTokens['consumer_key'],
                           consumer_secret=self.twitterTokens['consumer_secret'],
                           access_token_key=self.twitterTokens['access_token'],
                           access_token_secret=self.twitterTokens['access_token_secret'],
                           proxies={'http': http_proxy, 'https': https_proxy})
def explain_text_words(self, text, rank_by_importance=True):
    '''
    Word level explanation.
    '''
    text = preprocess_text(text)
    text_words = get_tokens(text)
    y = self.model.predict_class(text)
    word_ranking, values = self.sbe(text_words, y, rank_by_importance)
    ranked_words = [text_words[i] for i in word_ranking]
    return word_ranking, ranked_words, values
def attack(self, text, target_class, search_algorithm, random_attack=False):
    '''
    Attack text to change the prediction to `target_class`.

    Parameters
    -----------------
    text: str
        The text to attack.
    target_class: int
        The class to change the classification to.
    search_algorithm: str
        The search algorithm to use to attack the text: greedy or beam.
    random_attack: bool, optional
        Randomly selects words to target for attack.
    '''
    text = preprocess_text(text)
    x = get_tokens(text)
    explanation_size = int(self.percentage * len(x))
    if self.explainer is None:
        # target all words
        print("No explainer provided. Targeting all words in the input...")
        candidate_words_indexes = np.arange(len(x))
        candidate_words = np.array(x)[candidate_words_indexes].tolist()
    elif not random_attack:
        print('Generating explanation...')
        candidate_words_indexes, candidate_words = self.explainer.explain(
            text, explanation_size)
    else:
        print("Randomly selecting candidate words to perturb...")
        candidate_words_indexes = np.random.choice(
            len(x), explanation_size, replace=False)
        candidate_words = np.array(x)[candidate_words_indexes].tolist()
    assert len(candidate_words_indexes) == len(candidate_words)
    print("Extracted candidate words: ", candidate_words)
    synonyms_map = self.build_synonyms_map(candidate_words)
    print("Built synonyms map.")
    candidate_replacements = self.get_valid_replacements(
        x, candidate_words_indexes, synonyms_map)
    print("Filtered replacements.")
    Attacker.print_candidate_stats(candidate_replacements)
    # print("candidate_replacements: ")
    # pprint(candidate_replacements)
    if search_algorithm == 'greedy':
        print('Running greedy search...')
        used_replacements, adversary_found, prediction = self.greedy_search(
            x, candidate_replacements, target_class)
    elif search_algorithm == 'beam':
        print('Running beam search...')
        used_replacements, adversary_found, prediction = self.beam_search(
            x, candidate_replacements, target_class)
    else:
        raise ValueError('Invalid search algorithm provided')
    print("Chose replacements.")
    # Generate adversarial text
    adv_text = Attacker.get_adv_text(text, used_replacements)
    return used_replacements, adversary_found, adv_text, prediction
def fix(self, text, target_class, beam_size=4, random_fix=False):
    '''
    Change the classification of a text to the correct class.

    Parameters
    ------------
    text: str
        The text that is misclassified.
    target_class: int
        The label of the class to change the prediction to.
    beam_size: int
    random_fix: bool, optional
        If set to True, words will be targeted randomly for replacement.

    Returns
    ----------------
    suggestions: list
        The list of suggested replacement sets.
    '''
    text = preprocess_text(text)
    x = get_tokens(text)
    explanation_size = int(self.percentage * len(x))
    if self.explainer is None:
        # target all words
        print("No explainer provided. Targeting all words in the input...")
        candidate_words_indexes = np.arange(len(x))
        candidate_words = np.array(x)[candidate_words_indexes].tolist()
    elif not random_fix:
        print('Generating explanation...')
        candidate_words_indexes, candidate_words = self.explainer.explain(
            text, explanation_size)
    else:
        print("Randomly selecting candidate words to perturb...")
        candidate_words_indexes = np.random.choice(
            len(x), explanation_size, replace=False)
        candidate_words = np.array(x)[candidate_words_indexes].tolist()
    print("Extracted candidate words: ", candidate_words)
    synonyms_map = self.build_synonyms_map(candidate_words)
    print("Built synonyms map.")
    candidate_replacements = self.get_valid_replacements(
        x, candidate_words_indexes, synonyms_map)
    print('Filtered replacements.')
    print('Running beam search...')
    suggestions = self.beam_search(x, candidate_replacements, target_class,
                                   beam_size=beam_size, return_multiple=True)
    return suggestions
def get_journeys(departure_point, arrival_point, departure_date, region):
    mode = 'journeys?'
    starting_from = 'from=' + departure_point
    going_to = 'to=' + arrival_point
    at_time = 'datetime=' + departure_date
    url_final = ROOT_URL + region + mode + starting_from + '&' + going_to + '&' + at_time
    # url = query to Navitia, auth = our username/password
    data = requests.get(url=url_final, auth=(get_tokens('navitia'), ''))
    print(url_final)
    # Store in data the part of the JSON that matters for the rest of the process
    data = data.json()["journeys"][0]
    # Create a Journey object and give it the JSON data
    journey = Journey(
        data["sections"][1]["from"]["name"],           # name of the departure point
        data["sections"][1]["to"]["name"],             # name of the arrival point
        data["requested_date_time"],                   # date requested by the user
        data["departure_date_time"],                   # departure date of the journey
        data["arrival_date_time"],                     # arrival date of the journey
        data["duration"],                              # duration of the journey
        data["sections"][1]["display_informations"]["physical_mode"],    # type of public transport
        data["sections"][1]["display_informations"]["name"],             # name of the route
        data["sections"][1]["display_informations"]["network"],          # name of the transport network
        data["sections"][1]["display_informations"]["trip_short_name"],  # ID of the trip
        data["sections"][1]["stop_date_times"])        # list of all the stops of the journey
    return journey
import os

path_pmb = '../Data/pmb/pmb-2.1.0/data/gold'
language_doc_dict = sort_docs(path_pmb)
languages = language_doc_dict.keys()
for language in languages:
    n_docs = len(language_doc_dict[language])
    docs = language_doc_dict[language]
    # Count the tokens in every document for this language
    tokens_n = []
    for doc in docs:
        path_to_doc = f'{doc}/{language}.drs.xml'
        tokens = get_tokens(path_to_doc)
        length = len(tokens)
        tokens_n.append(length)
    n_tokens = sum(tokens_n)
    print(f'{language}: num docs: {n_docs}, num tokens: {n_tokens}')

pairs = get_pairs(languages)
print(pairs)
for lang1, lang2 in pairs:
    docs_lang1 = language_doc_dict[lang1]
    docs_lang2 = language_doc_dict[lang2]
    # Number of documents shared by both languages
    number_of_docs = len(docs_lang1.intersection(docs_lang2))
def preprocess(doc):
    tokens = utils.get_tokens(doc)
    tokensWOStopWords = utils.removeStopWords(tokens)
    cleanWords = utils.cleanText(tokensWOStopWords)
    return cleanWords
def build_vocab(data: Iterable[str],
                num_words: Optional[int] = None,
                min_count: int = 1,
                pad_to_multiple_of: Optional[int] = None) -> Vocab:
    """
    Creates a vocabulary mapping from words to ids. Increasing integer ids are
    assigned by word frequency, using lexical sorting as a tie breaker. The only
    exception to this are special symbols such as the padding symbol (PAD).

    :param data: Sequence of sentences containing whitespace-delimited tokens.
    :param num_words: Optional maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :param pad_to_multiple_of: If not None, pads the vocabulary to a size that is
           the next multiple of this int.
    :return: Word-to-id mapping.
    """
    vocab_symbols_set = set(C.VOCAB_SYMBOLS)
    raw_vocab = Counter(token for line in data for token in utils.get_tokens(line)
                        if token not in vocab_symbols_set)
    # For words with the same count, they will be ordered reverse alphabetically.
    # Not an issue since we only care for consistency.
    pruned_vocab = [w for c, w in sorted(((c, w) for w, c in raw_vocab.items()
                                          if c >= min_count), reverse=True)]
    if num_words is not None:
        vocab = list(islice(pruned_vocab, num_words))
        num_words_log = str(num_words)
    else:
        vocab = pruned_vocab
        num_words_log = "None"

    if pad_to_multiple_of is not None:
        current_vocab_size = len(vocab) + len(C.VOCAB_SYMBOLS)
        rest = current_vocab_size % pad_to_multiple_of
        padded_vocab_size = current_vocab_size if rest == 0 else current_vocab_size + pad_to_multiple_of - rest
        logger.info("Padding vocabulary to a multiple of %d: %d -> %d",
                    pad_to_multiple_of, current_vocab_size, padded_vocab_size)
        pad_entries = [C.PAD_FORMAT % idx
                       for idx in range(current_vocab_size, padded_vocab_size)]
        pad_to_multiple_log = str(pad_to_multiple_of)
    else:
        pad_entries = []
        pad_to_multiple_log = "None"

    word_to_id = {word: idx
                  for idx, word in enumerate(chain(C.VOCAB_SYMBOLS, vocab, pad_entries))}
    logger.info("Vocabulary: types: %d/%d/%d/%d (initial/min_pruned/max_pruned/+special) "
                + "[min_frequency=%d, max_num_types=%s, pad_to_multiple_of=%s]",
                len(raw_vocab), len(pruned_vocab), len(vocab), len(word_to_id),
                min_count, num_words_log, pad_to_multiple_log)
    # Important: pad symbol becomes index 0
    assert word_to_id[C.PAD_SYMBOL] == C.PAD_ID
    return word_to_id
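# Hypothetical usage sketch for build_vocab above (not from the source); it
# assumes the module's C constants (VOCAB_SYMBOLS, PAD_SYMBOL, PAD_ID, PAD_FORMAT)
# and utils.get_tokens are available as in the snippet.
sentences = ["the cat sat on the mat", "the dog sat"]
word_to_id = build_vocab(sentences, num_words=10, min_count=1, pad_to_multiple_of=8)
# Special symbols get the lowest ids, so the padding symbol maps to index 0;
# the remaining ids are assigned by decreasing frequency ("the", then "sat", ...).
print(word_to_id)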