Example 1
 def make_graph(self, instr_file):
     """Creates a graph from the given instruction file"""
     err_msg = "couldn't read instructions"
     self.uniquified = False
     with utils.ringo_open(instr_file, err_msg) as f:
         # Execute query
         try:
             for line in f:
                 tokens = list(utils.get_tokens(line))
                 {
                     'SRC': self.set_src,
                     'DST': self.set_dst,
                     'EDGE_ATTR': self.set_edge_attr,
                     'FLAGS': self.set_flags,
                     'LOAD': self.load,
                     'START': self.start,
                     'LABEL': self.label,
                     'JOIN': self.join,
                     'SELECT': self.select,
                     'COUNT': self.count,
                     'GROUP': self.group,
                     'ORDER': self.order
                 }[tokens[0]](*tokens[1:])
         except KeyError:
             raise InvalidInstructionException('Incomplete query')
         self.build_graph()
     return
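A self-contained sketch of the dispatch pattern used above: the first token of each instruction line selects a handler and the remaining tokens become its arguments. The handler names and instruction lines here are illustrative stand-ins, not the project's real query format.

def load(path):
    print("load", path)

def join(left, right):
    print("join", left, right)

handlers = {'LOAD': load, 'JOIN': join}
for line in ["LOAD tables/posts.tsv", "JOIN src dst"]:
    tokens = line.split()
    try:
        # First token picks the handler, the rest are its arguments.
        handlers[tokens[0]](*tokens[1:])
    except KeyError:
        raise ValueError('Incomplete query')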
Example 2
File: tags.py Project: tarsqi/ttk
 def feature_value(self, name):
     # TODO: can probably use the local attrs dictionary for many of these
     if name == 'eventStatus':
         return '1'
     elif name == 'nodeType':
         return self.__class__.__name__
     elif name in (EVENTID, EIID, CLASS, TENSE, ASPECT, EPOS, STEM):
         return self.tree.events[self.eid][name]
     elif name == MOD:
         return self._get_attribute(name, 'NONE')
     elif name == POL:
         return self._get_attribute(name, 'POS')
     elif name in ('text', FORM):
         if self.eid in self.tree.events:
             return self.tree.events[self.eid][FORM]
         else:
             logger.warn("Event %s is not stored in the events on the TarsqiTree" % self)
             return ' '.join([t.text for t in get_tokens(self)])
     elif name == POS:
         try:
             return self.tree.events[self.eid][POS]
         except KeyError:
             # I don't remember whether POS has a particular use here
             # or is a left over from prior times
             logger.warn("Returning 'epos' instead of 'pos' value")
             return self.tree.events[self.eid][EPOS]
     else:
         raise AttributeError(name)
Example 3
    def parse_text(tw_obj):
        # remove user mentions and urls from the text
        # use the extended tweet if present
        if 'extended_tweet' in tw_obj:
            text = tw_obj['extended_tweet']['full_text']
        # or use normal text
        else:
            text = tw_obj['text']

        # process quoted tweet and append to text
        if tw_obj['is_quote_status'] and 'quoted_status' in tw_obj:
            # process quoted tweet
            qt_obj = tw_obj['quoted_status']
            if 'extended_tweet' in qt_obj:
                qt_text = qt_obj['extended_tweet']['full_text']
            # or use normal text
            else:
                qt_text = qt_obj['text']
            text = ''.join([text, ' %QUOTES% ', qt_text])

        text_norm = normalizeTextForTagger(replace_sp_tokens(text))
        # process text into list of keywords
        text_tokens = get_tokens(text)
        text_tokens = [t for t in text_tokens if t not in stopwords]
        token_counts = dict(Counter(text_tokens))
        # text_tokens = [lemma(t) for t in text_tokens]

        return text, text_norm, text_tokens, token_counts
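A minimal, runnable sketch of the quoted-tweet handling above, using a hand-built tweet dict in the Twitter streaming-API layout; the tokenization, stopword filtering, and normalization helpers are omitted here.

tw_obj = {
    'text': 'Short text',
    'extended_tweet': {'full_text': 'Full tweet text with a quote'},
    'is_quote_status': True,
    'quoted_status': {'text': 'Quoted tweet text'},
}

# Prefer the extended tweet text when it is present.
text = tw_obj.get('extended_tweet', {}).get('full_text', tw_obj.get('text'))
if tw_obj.get('is_quote_status') and 'quoted_status' in tw_obj:
    qt_obj = tw_obj['quoted_status']
    qt_text = qt_obj.get('extended_tweet', {}).get('full_text', qt_obj.get('text'))
    # Append the quoted text, separated by the %QUOTES% marker.
    text = ''.join([text, ' %QUOTES% ', qt_text])

print(text)  # Full tweet text with a quote %QUOTES% Quoted tweet text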
def preprocess(doc, query):
    if query:
        tokens = utils.get_tokens(doc)
        doc = utils.removeStopWords(tokens)
    lemmaWords = utils.lemmatizer(doc)
    cleanWords = utils.cleanText(lemmaWords)
    return cleanWords
Example 6
def auto_complete_places(place, region):

    # Gather the variables and build the request URL.
    # pt_objects refers to public-transport objects.
    looking_for = 'pt_objects?'
    # q= is the command used to call the autocompletion.
    query = 'q=' + ''.join(place.split())
    # type[]=stop_area restricts the results to public-transport stops only.
    object_type = 'type[]=stop_area'
    # Concatenate the final URL from the ROOT_URL defined above and the region passed as a parameter.
    url_final = ROOT_URL + region + '/' + looking_for + query + '&' + object_type

    print("Requesting @ " + url_final)
    # Query the API with the requests library, adding the token dynamically.
    data = requests.get(url=url_final, auth=(get_tokens('navitia'), ''))
    # The array of results will be stored in data.
    dict_results = {}

    if 'pt_objects' not in data.json():
        return dict_results

    data = data.json()['pt_objects']

    # Loop over the results and store each stop's commercial name and unique ID as key:value pairs in a dictionary.
    for result in data:
        dict_results.update(
            {result['stop_area']['name']: result['stop_area']['id']})

    return dict_results
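A standalone sketch of the response handling above, using a stubbed Navitia-style pt_objects payload; the stop name and id are made up for illustration.

sample = {'pt_objects': [
    {'stop_area': {'name': 'Example Station', 'id': 'stop_area:example:1'}},
]}
dict_results = {}
for result in sample.get('pt_objects', []):
    # Map the stop's commercial name to its unique id.
    dict_results.update({result['stop_area']['name']: result['stop_area']['id']})
print(dict_results)  # {'Example Station': 'stop_area:example:1'}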
Example 8
def main():
    # Create the updater, passing it the Telegram API token.
    updater = Updater(get_tokens('telegram'))

    # Keep a reference to the dispatcher so handlers can be added to it.
    dispatcher = updater.dispatcher

    # Add the conversation handler with the states DEPARTURE, AUTOCOMPLETE_DEP, DESTINATION, AUTOCOMPLETE_DEST, and DATETIME.
    conv_handler = ConversationHandler(
        entry_points=[CommandHandler('r', recherche)],
        states={
            DEPARTURE: [
                MessageHandler(Filters.text & ~Filters.command,
                               auto_complete_dep),
            ],
            AUTOCOMPLETE_DEP: [CallbackQueryHandler(destination)],
            DESTINATION: [
                MessageHandler(Filters.text & ~Filters.command,
                               auto_complete_dest)
            ],
            AUTOCOMPLETE_DEST: [CallbackQueryHandler(datetime)],
            DATETIME:
            [MessageHandler(Filters.text & ~Filters.command, result)]
        },
        fallbacks=[CommandHandler('cancel', cancel)])

    # Add the conversation handler to the dispatcher (all other handlers are added implicitly).
    dispatcher.add_handler(conv_handler)

    # Start the bot.
    updater.start_polling()

    updater.idle()
def get_text_language(text):
    language_rank = {}
    tokens = utils.get_tokens(text)
    for language in language_helper.get_languages():
        c_stopwords = language_helper.get_language_stopwords(language)
        # Count how many of the tokens are stopwords of this language.
        language_rank[language] = sum(1 for t in tokens if t in c_stopwords)
    sorted_languages = sorted(language_rank.items(), key=lambda x: -x[1])
    return sorted_languages[0][0]
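A self-contained illustration of the same stopword-overlap ranking, with tiny made-up stopword sets standing in for language_helper's data.

stopword_sets = {
    'english': {'the', 'is', 'on', 'of'},
    'italian': {'il', 'e', 'di', 'che'},
}
tokens = "the cat is on the roof".split()
# Rank languages by how many of the text's tokens are in their stopword list.
language_rank = {lang: sum(1 for t in tokens if t in sw)
                 for lang, sw in stopword_sets.items()}
print(max(language_rank, key=language_rank.get))  # english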
    def explain(self, text, nwords, return_weights=False):
        '''
        Use `LimeTextExplainer` to obtain the top `nwords` most important/polar words in the `text` as 
        an explanation.


        Parameters
        --------------
        text: str
            The text to explain.

        nwords: int
            The number of most important words to return (i.e. explanation size).

        return_weights: bool
            Set to True to also return the weights assigned by LIME and the raw explanation object.

        Returns
        ---------------
        word_ranking : list
            Indexes of the `nwords` top-ranked words in the text.
        
        ranked_words: list
            List of `nwords` top-ranked words in the text.

        weights: dict, optional
            The dictionary of weights (word position -> weight) assigned by LIME to the words
            in the text.

        explanation: optional
            The explanation object returned by `LimeTextExplainer`.
        '''
        text = preprocess_text(text)
        text_words = get_tokens(text)

        class_names = ['negative', 'positive']
        # bow is set to False because word order is important
        explainer = LimeTextExplainer(class_names=class_names,
                                      feature_selection='auto',
                                      bow=False,
                                      split_expression=' ',
                                      verbose=False)

        explanation = explainer.explain_instance(
            text_instance=text,
            labels=[0, 1],
            classifier_fn=self.predict_texts,
            num_features=nwords,
            num_samples=self.nsamples)
        # sort weights by decreasing absolute value
        weights = OrderedDict(
            sorted(explanation.as_map()[1],
                   key=lambda weight: -abs(weight[1])))
        word_ranking = np.array(list(weights.keys()))
        ranked_words = [text_words[i] for i in word_ranking]
        if return_weights:
            return word_ranking, ranked_words, weights, explanation
        return word_ranking, ranked_words
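For context, a standalone sketch of the LimeTextExplainer call pattern used above, with a toy classifier_fn; the model, sample size, and class names here are illustrative, and the `lime` package is required.

import numpy as np
from lime.lime_text import LimeTextExplainer

def toy_predict(texts):
    # Probability of 'positive' grows with the number of occurrences of "good".
    pos = np.array([min(1.0, 0.2 + 0.3 * t.split().count('good')) for t in texts])
    return np.column_stack([1 - pos, pos])

explainer = LimeTextExplainer(class_names=['negative', 'positive'],
                              bow=False, split_expression=' ')
explanation = explainer.explain_instance('a good movie with a good cast',
                                         classifier_fn=toy_predict,
                                         labels=[1], num_features=3,
                                         num_samples=500)
print(explanation.as_map()[1])  # [(word position, weight), ...]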
Example 11
def index_file(inv_idx, file_buf, file_name):
    tokens = None
    if file_name.endswith('.pdf'):
        text = '\n'.join(pdftotext.PDF(file_buf))
        tokens = get_tokens(text, False)
    elif file_name.endswith('.txt'):
        text = file_buf.read().decode()
        tokens = get_tokens(text, True)

    if tokens is not None:
        if file_name.endswith('.pdf'):
            id = file_name
            for text, para in tokens:
                index_words(inv_idx, para, id, text)
        elif file_name.endswith('.txt'):
            for i, (text, para) in enumerate(tokens, 0):
                id = f'{file_name}_para_{i}'
                index_words(inv_idx, para, id, text)
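A minimal sketch of the inverted-index update this snippet depends on; index_words here is a stand-in with the same argument order as above (inv_idx, tokens, document id, raw text), not the project's implementation.

from collections import defaultdict

def index_words(inv_idx, tokens, doc_id, text):
    # Record that each token occurs in this document/paragraph id.
    for token in tokens:
        inv_idx[token].add(doc_id)

inv_idx = defaultdict(set)
index_words(inv_idx, ['hello', 'world'], 'notes.txt_para_0', 'hello world')
index_words(inv_idx, ['hello', 'again'], 'notes.txt_para_1', 'hello again')
print(dict(inv_idx))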
Example 12
 def get_adv_text(orig_text, used_replacements):
     '''
     Apply replacements to text to obtain adversarial text.
     '''
     text_words = get_tokens(orig_text)
     for (pos, word, replacement_word) in used_replacements:
         assert text_words[pos] == word, 'pos = %d, text_word = %s , word = %s' % (pos, text_words[pos], word)
         text_words[pos] = replacement_word
     return ' '.join(text_words)
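A small illustration of the replacement format get_adv_text expects, assuming get_tokens is a plain whitespace tokenizer; each replacement is (position, original word, replacement word).

text_words = "the movie was great".split()
for (pos, word, replacement_word) in [(3, 'great', 'fantastic')]:
    # The original word must still be at the recorded position.
    assert text_words[pos] == word
    text_words[pos] = replacement_word
print(' '.join(text_words))  # the movie was fantastic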
    def __init__(self):
        self.twitterTokens = get_tokens()

        http_proxy, https_proxy = get_proxy()

        self.api = twitter.Api(
            consumer_key=self.twitterTokens['consumer_key'],
            consumer_secret=self.twitterTokens['consumer_secret'],
            access_token_key=self.twitterTokens['access_token'],
            access_token_secret=self.twitterTokens['access_token_secret'],
            proxies={'http': http_proxy, 'https': https_proxy})
 def explain_text_words(self, text, rank_by_importance=True):
     '''
     Word level explanation.
     '''
     text = preprocess_text(text)
     text_words = get_tokens(text)
     y = self.model.predict_class(text)
     word_ranking, values = self.sbe(text_words, y, rank_by_importance)
     ranked_words = [text_words[i] for i in word_ranking]
     return word_ranking, ranked_words, values
Example 15
    def attack(self, text, target_class, search_algorithm, random_attack=False):
        '''
        Attack text to change the prediction to `target_class`.

        Parameters
        -----------------
        text: str
            The text to attack.
        
        target_class: int
            The class to change the classification to.

        search_algorithm: str
            The search algorithm to use to attack the text: 'greedy' or 'beam'.

        random_attack: bool, optional
            If set to True, words are selected randomly for the attack.

        '''
        text = preprocess_text(text)
        x = get_tokens(text)
        explanation_size = int(self.percentage * len(x))
        if self.explainer is None:  # target all words
            print("No explainer provided. Targeting all words in the input...")
            candidate_words_indexes = np.arange(len(x))
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        elif not random_attack:
            print('Generating explanation...')
            candidate_words_indexes, candidate_words = self.explainer.explain(text, explanation_size)
        else:
            print("Randomly selecting candidate words to perturb...")
            candidate_words_indexes = np.random.choice(len(x), explanation_size, replace=False)
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        assert len(candidate_words_indexes) == len(candidate_words)
        print("Extracted candidate words: ", candidate_words)
        synonyms_map = self.build_synonyms_map(candidate_words)
        print("Built synonyms map.")
        candidate_replacements = self.get_valid_replacements(x, candidate_words_indexes, synonyms_map)
        print("Filtered replacements.")
        Attacker.print_candidate_stats(candidate_replacements)
        #print("candidate_replacements: ")
        #pprint(candidate_replacements)
        if search_algorithm == 'greedy':
            print('Running greedy search...')
            used_replacements, adversary_found, prediction = self.greedy_search(x, candidate_replacements, target_class)
        elif search_algorithm == 'beam':
            print('Running beam search...')
            used_replacements, adversary_found, prediction = self.beam_search(x, candidate_replacements, target_class)
        else:
            raise ValueError('Invalid search algorithm provided')
        print("Chose replacements.")

        # Generate adversarial text
        adv_text = Attacker.get_adv_text(text, used_replacements)
        return used_replacements, adversary_found, adv_text, prediction
Example 16
    def fix(self, text, target_class, beam_size=4, random_fix=False):
        '''
        Change the classification of a text to the correct class.

        Parameters
        ------------
        text: str
            The text that is misclassified.
        
        target_class: int
            The label of the class to change the prediction to.

        beam_size: int
            The beam width to use in the beam search.

        random_fix: bool, optional
            If set to True, words will be targeted randomly for replacement.


        Returns
        ----------------
        suggestions: list
            The list of suggested replacement sets.


        '''
        text = preprocess_text(text)
        x = get_tokens(text)
        explanation_size = int(self.percentage * len(x))
        if self.explainer is None:  # target all words
            print("No explainer provided. Targeting all words in the input...")
            candidate_words_indexes = np.arange(len(x))
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        elif not random_fix:
            print('Generating explanation...')
            candidate_words_indexes, candidate_words = self.explainer.explain(text, explanation_size)
        else:
            print("Randomly selecting candidate words to perturb...")
            candidate_words_indexes = np.random.choice(len(x), explanation_size, replace=False)
            candidate_words = np.array(x)[candidate_words_indexes].tolist()
        print("Extracted candidate words: ", candidate_words)
        synonyms_map = self.build_synonyms_map(candidate_words)
        print("Built synonyms map.")
        candidate_replacements = self.get_valid_replacements(x, candidate_words_indexes, synonyms_map)
        print('Filtered replacements.')
        print('Running beam search...')
        suggestions = self.beam_search(x, candidate_replacements, target_class, beam_size=beam_size, return_multiple=True)
        return suggestions
Example 17
def get_journeys(departure_point, arrival_point, departure_date, region):

    mode = 'journeys?'
    starting_from = 'from=' + departure_point
    going_to = 'to=' + arrival_point
    at_time = 'datetime=' + departure_date

    url_final = ROOT_URL + region + mode + starting_from + '&' + going_to + '&' + at_time

    # url = the query to Navitia; auth = our username/password.
    data = requests.get(url=url_final, auth=(get_tokens('navitia'), ''))

    print(url_final)

    # Keep only the part of the JSON that matters for the rest of the process.
    data = data.json()["journeys"][0]

    # Create a Journey object and give it the JSON data.
    journey = Journey(
        data["sections"][1]["from"]["name"],  # name of the departure point
        data["sections"][1]["to"]["name"],  # name of the arrival point
        data["requested_date_time"],  # date requested by the user
        data["departure_date_time"],  # departure date of the journey
        data["arrival_date_time"],  # arrival date of the journey
        data["duration"],  # duration of the journey
        data["sections"][1]["display_informations"]["physical_mode"],  # type of public transport
        data["sections"][1]["display_informations"]["name"],  # name of the journey
        data["sections"][1]["display_informations"]["network"],  # name of the transport network
        data["sections"][1]["display_informations"]["trip_short_name"],  # ID of the trip
        data["sections"][1]["stop_date_times"])  # list of all the stops on the journey

    return journey
import os

path_pmb = '../Data/pmb/pmb-2.1.0/data/gold'
language_doc_dict = sort_docs(path_pmb)

languages = language_doc_dict.keys()

for language in languages:
    n_docs = len(language_doc_dict[language])
    docs = language_doc_dict[language]
    tokens_n = []

    for doc in docs:
        path_to_doc = f'{doc}/{language}.drs.xml'
        tokens = get_tokens(path_to_doc)
        length = len(tokens)
        tokens_n.append(length)
    n_tokens = sum(tokens_n)
    print(f'{language}: num docs: {n_docs}, num tokens: {n_tokens}')

pairs = get_pairs(languages)
print(pairs)

for lang1, lang2 in pairs:
    docs_lang1 = language_doc_dict[lang1]
    docs_lang2 = language_doc_dict[lang2]
    number_of_docs = len(docs_lang1.intersection(docs_lang2))
    print(f'{lang1}-{lang2}: shared docs: {number_of_docs}')
Example 19
def preprocess(doc):
    tokens = utils.get_tokens(doc)
    tokensWOStopWords = utils.removeStopWords(tokens)
    cleanWords = utils.cleanText(tokensWOStopWords)
    return cleanWords
Example 20
def build_vocab(data: Iterable[str],
                num_words: Optional[int] = None,
                min_count: int = 1,
                pad_to_multiple_of: Optional[int] = None) -> Vocab:
    """
    Creates a vocabulary mapping from words to ids. Increasing integer ids are assigned by word frequency,
    using lexical sorting as a tie breaker. The only exception to this are special symbols such as the padding symbol
    (PAD).

    :param data: Sequence of sentences containing whitespace delimited tokens.
    :param num_words: Optional maximum number of words in the vocabulary.
    :param min_count: Minimum occurrences of words to be included in the vocabulary.
    :param pad_to_multiple_of: If not None, pads the vocabulary to a size that is the next multiple of this int.
    :return: Word-to-id mapping.
    """
    vocab_symbols_set = set(C.VOCAB_SYMBOLS)
    raw_vocab = Counter(token for line in data
                        for token in utils.get_tokens(line)
                        if token not in vocab_symbols_set)
    # For words with the same count, they will be ordered reverse alphabetically.
    # Not an issue since we only care for consistency
    pruned_vocab = [
        w for c, w in sorted(((c, w)
                              for w, c in raw_vocab.items() if c >= min_count),
                             reverse=True)
    ]

    if num_words is not None:
        vocab = list(islice(pruned_vocab, num_words))
        num_words_log = str(num_words)
    else:
        vocab = pruned_vocab
        num_words_log = "None"

    if pad_to_multiple_of is not None:
        current_vocab_size = len(vocab) + len(C.VOCAB_SYMBOLS)
        rest = current_vocab_size % pad_to_multiple_of
        padded_vocab_size = current_vocab_size if rest == 0 else current_vocab_size + pad_to_multiple_of - rest
        logger.info("Padding vocabulary to a multiple of %d: %d -> %d",
                    pad_to_multiple_of, current_vocab_size, padded_vocab_size)
        pad_entries = [
            C.PAD_FORMAT % idx
            for idx in range(current_vocab_size, padded_vocab_size)
        ]
        pad_to_multiple_log = str(pad_to_multiple_of)
    else:
        pad_entries = []
        pad_to_multiple_log = "None"

    word_to_id = {
        word: idx
        for idx, word in enumerate(chain(C.VOCAB_SYMBOLS, vocab, pad_entries))
    }
    logger.info(
        "Vocabulary: types: %d/%d/%d/%d (initial/min_pruned/max_pruned/+special) "
        + "[min_frequency=%d, max_num_types=%s, pad_to_multiple_of=%s]",
        len(raw_vocab), len(pruned_vocab), len(vocab), len(word_to_id),
        min_count, num_words_log, pad_to_multiple_log)

    # Important: pad symbol becomes index 0
    assert word_to_id[C.PAD_SYMBOL] == C.PAD_ID
    return word_to_id
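A minimal usage sketch, assuming this is sockeye's vocab.build_vocab (the special symbols and PAD handling come from sockeye's constants module); the sentences are made up for illustration.

sentences = ["the cat sat", "the dog sat on the mat", "the cat ran"]
word_to_id = build_vocab(sentences, num_words=10, min_count=1)
# Special symbols come first (PAD at index 0), then corpus words ordered by
# decreasing frequency with reverse-lexical tie-breaking.
print(word_to_id)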