Example 1
    def decode(self, src_sentence):
        """Decodes a single source sentence using beam search. """
        self.count = 0
        self.time = 0
        self.initialize_predictor(src_sentence)
        hypos = self._get_initial_hypos()
        it = 1
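        # The optional length reward is based on the source sentence length.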
        if self.reward:
            self.l = len(src_sentence)
        while not self.stop_criterion(
                utils.flattened(hypos)) and it < self.max_len:
            it += 1
            next_hypos = []
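            # Expand each group separately; cross-group diversity is
            # applied in _get_next_hypos (see Example 3).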
            for i, group in enumerate(hypos):
                next_group = []
                for hypo in group:
                    if hypo.get_last_word() == utils.EOS_ID:
                        next_group.append(hypo)
                        continue
                    next_group.extend(self._expand_hypo(hypo))
                next_hypos.append(
                    self._get_next_hypos(next_group, self.group_sizes[i],
                                         next_hypos))
            hypos = next_hypos

        return self.get_full_hypos_sorted(utils.flattened(hypos))
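
All of the examples above and below rely on a flattened() helper (called as utils.flattened or imported directly) that is never shown. A minimal sketch consistent with every call site, assuming it flattens exactly one level of nesting:

def flattened(list_of_lists):
    # Hypothetical sketch: collapse a list of lists into one flat list
    # (one level of nesting only).
    return [item for sublist in list_of_lists for item in sublist]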
Example 2
def find_files_to_process():
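    """Scan DOWNLOAD_DIR and split its files into those with a parseable
    date (to be processed, paired with cumulative byte counts) and those
    without one (to be ignored)."""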
    files_from_crawler = list(flattened(recursive_listdir(DOWNLOAD_DIR)))

    files_to_process = []
    files_to_ignore = []
    for path in files_from_crawler:
        try:
            import_date = find_date(path)
            size = os.path.getsize(path)
            files_to_process.append((path, import_date, size))
        except ValueError:
            files_to_ignore.append(path)

    def _import_date(entry):
        _path, import_date, _size = entry
        return import_date

    def _size(entry):
        _path, _import_date, size = entry
        return size

    bytes_accumulator = Accumulator()
    files_to_process.sort(key=_import_date)
    files_to_process = [(f, bytes_accumulator(_size(f)))
                        for f in files_to_process]
    bytes_to_process = bytes_accumulator.getvalue()

    return (bytes_to_process, files_to_process, files_to_ignore)
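
The Accumulator class is not shown either. Judging from the calls above (invoked with a byte count, getvalue() for the total), a minimal sketch might be:

class Accumulator:
    # Hypothetical sketch: a callable running total. Each call adds the
    # argument and returns the cumulative sum so far; getvalue() returns
    # the grand total.
    def __init__(self, start=0):
        self.total = start

    def __call__(self, value):
        self.total += value
        return self.total

    def getvalue(self):
        return self.total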
Example 3
    def _get_next_hypos(self, all_hypos, size, other_groups=None):
        """Get hypos for the next iteration."""
        all_scores = np.array([self.get_adjusted_score(hypo)
                               for hypo in all_hypos])
        if other_groups:
            all_scores = all_scores + self.lmbda * self.hamming_distance_penalty(
                all_hypos, utils.flattened(other_groups))
        inds = utils.argmax_n(all_scores, size)
        return [all_hypos[ind] for ind in inds]
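
utils.argmax_n is not shown; from its use above it must return the indices of the size highest scores. A sketch under that assumption:

import numpy as np

def argmax_n(arr, n):
    # Hypothetical sketch: indices of the n largest entries, best first.
    # np.argpartition selects the top n in linear time; the final argsort
    # only orders those n indices.
    top = np.argpartition(arr, -n)[-n:]
    return top[np.argsort(-arr[top])]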
Example 4
        try:
            import_date = find_date(filename)
            return True
        except (ValueError, ImportError):
            return False

    re_agency = re.compile('^[0-9]*[A-Z]+')
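    # re_agency matches an optional run of digits followed by uppercase
    # letters, e.g. "09XYZ".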
    def extract_prefix(filename):
        prefix_match = re_agency.match(filename.upper())
        if prefix_match is not None:
            prefix = prefix_match.group()
            return fix_prefix(prefix)
        else:
            return None

    # Materialize lists so files_to_process can be iterated more than once.
    files_to_process = [
        name for name in map(os.path.basename,
                             flattened(recursive_listdir(DOWNLOAD_DIR)))
        if filename_has_date(name)]
    prefixes = [extract_prefix(name) for name in files_to_process]

    from functools import reduce  # reduce is not a builtin on Python 3

    def unique(iterable):
        def combine(accum, item):
            accum[item] = None
            return accum
        # Collecting items as dict keys de-duplicates while keeping order.
        return list(reduce(combine, iterable, {}))

    def frequency(iterable):
        def combine(frequencies, item):
            frequencies[item] = frequencies.get(item, 0) + 1
            return frequencies
        return reduce(combine, iterable, {})
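
recursive_listdir is also not shown. A minimal sketch consistent with how it is called (it must yield nested sequences of paths, since every call site wraps it in flattened()):

import os

def recursive_listdir(root):
    # Hypothetical sketch: yield one list of full paths per directory,
    # leaving the flattening to flattened().
    for dirpath, _dirnames, filenames in os.walk(root):
        yield [os.path.join(dirpath, name) for name in filenames]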