def _load_stations():
    """Load the station name -> telecode mapping from 'stations.dat'.

    The data file lives next to this module; each line contains a
    station name and its telecode separated by whitespace.

    Returns:
        dict: unicode station name -> unicode telecode.
    """
    filepath = os.path.join(os.path.dirname(__file__), 'stations.dat')
    stations = {}
    with open(filepath, 'rb') as f:
        # Iterate the file object directly instead of materializing
        # every line with readlines() first.
        for line in f:
            name, telecode = line.split()
            stations[to_unicode(name)] = to_unicode(telecode)
    return stations
def test_should_create_sphinx_header(self):
    """ Should create Sphinx header. """
    os.chdir(self.var_path)
    with HookStdOut():
        self.app.create_project('testproject')
    view_path = join(self.var_path, 'testproject', 'views', 'frontend', 'index.py')
    with open(view_path, 'rb') as f:
        lines = f.readlines()
    # str.strip() performs both lstrip and rstrip in one idiomatic call.
    self.assertEqual(to_unicode(lines[2].strip()),
                     'testproject.views.frontend.index')
    self.assertEqual(to_unicode(lines[3].strip()),
                     '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
def to_sentences(self, paragraph):
    """Split *paragraph* into a tuple of whitespace-stripped sentences."""
    tokenizer = self._sentence_tokenizer
    if hasattr(tokenizer, '_params'):
        # Punkt-style tokenizers expose ``_params``; extend the known
        # abbreviations with language-specific extras so that they are
        # not mistaken for sentence boundaries.
        extras = self.LANGUAGE_EXTRA_ABREVS.get(self._language, [])
        tokenizer._params.abbrev_types.update(extras)
    sentences = tokenizer.tokenize(to_unicode(paragraph))
    return tuple(unicode.strip(sentence) for sentence in sentences)
def __init__(self, words, tokenizer=None):
    """Build a term-frequency model from *words*.

    ``words`` is either an already-tokenized sequence, or a string
    accompanied by a tokenizer that splits it into words.
    """
    if isinstance(words, string_types):
        if tokenizer is None:
            raise ValueError(
                "the tokenizer must be called if 'words' is not a sequence.")
        words = tokenizer.to_words(to_unicode(words))
    elif not isinstance(words, Sequence):  # error handling
        raise ValueError(
            "Parameter 'words' has to be sequence or string with tokenizer given."
        )

    self._terms = Counter(map(unicode.lower, words))
    # Guard against an empty model: fall back to 1 so later ratios
    # never divide by zero.
    self._max_frequency = max(self._terms.values()) if self._terms else 1
def cli():
    """Command-line entry point: query trains between two stations.

    Parses the docopt arguments, resolves both station telecodes,
    validates the travel date, performs the HTTP query and prints the
    resulting train table. Exits early with a message on any failure.
    """
    # Parse the command-line arguments.
    arguments = docopt(__doc__)

    from_station_code = stations.get(to_unicode(arguments['<from>']))
    if not from_station_code:
        print('Seems that no this station where you from.')
        exit()

    to_station_code = stations.get(to_unicode(arguments['<to>']))
    if not to_station_code:
        print('Seems that no this station where you going to.')
        exit()

    valid_date = get_valid_date(to_unicode(arguments['<date>']))
    if not valid_date:
        print('Not a valid date.')
        exit()

    # Transform valid options to a string.
    opts = ''.join(o[1] for o in arguments if o in '-d-g-k-t-z' and arguments[o])

    params = build_params(from_station_code, to_station_code, valid_date)
    try:
        resp = requests.get(QUERY_URL, params=params, verify=False)
    except ConnectionError:
        print(colorit('red', 'Network connection fail.'))
        exit()

    try:
        rows = resp.json()['data']['datas']
    except KeyError:
        print(colorit('green', 'No train available.'))
        # BUG FIX: without exiting here, control fell through to
        # TrainsCollection(rows, opts) with ``rows`` unbound,
        # raising NameError instead of a clean exit.
        exit()

    trains = TrainsCollection(rows, opts)
    trains.export()
def null_stemmer(object):
    """Identity "stemmer": lower-case the word without stemming it."""
    # NOTE(review): the parameter shadows the builtin ``object``; the
    # name is kept because it is part of the public signature.
    word = to_unicode(object)
    return word.lower()
def __init__(self, text, tokenizer, is_heading=False):
    """Hold one sentence: its stripped unicode text, the tokenizer used
    to split it into words, and whether it is a heading."""
    stripped_text = to_unicode(text).strip()
    self._text = stripped_text
    self._tokenizer = tokenizer
    # Normalize the flag to a real boolean.
    self._is_heading = True if is_heading else False
def normalize_word(self, word):
    """Return *word* as lower-cased unicode text."""
    normalized = to_unicode(word)
    return normalized.lower()
def parse_stop_words(data):
    """Parse stop-words from *data*: one word per line, blank lines skipped."""
    lines = to_unicode(data).splitlines()
    return frozenset(line.rstrip() for line in lines if line)
def to_words(self, sentence):
    """Tokenize *sentence* and keep only the tokens that qualify as words."""
    tokens = self._word_tokenizer.tokenize(to_unicode(sentence))
    return tuple(token for token in tokens if self._is_word(token))