Ejemplo n.º 1
0
def _read_coords(rawtext, **kwargs):
    """Collect coordinate pairs found in the text of a notice.

    The text is first passed through ``_strip_ws``.  If it contains any
    ``MINUTE_COORDS`` matches, those are handed (with ``kwargs``) to
    ``_clean_minute_coords`` and its result is returned directly.

    Otherwise a list of ``(lat, lng)`` float tuples is built from, in
    order: ``DECIMAL_COORDS`` matches taken pairwise through ``window``,
    ``PARANTHETICAL_COORDS`` matches, and ``PORTLAND_COORDS`` matches.
    """
    text = _strip_ws(rawtext)

    minute_matches = re.findall(MINUTE_COORDS, text)
    if minute_matches:
        return _clean_minute_coords(minute_matches, **kwargs)

    coords = []

    # Decimal values are paired up with a sliding window of size two;
    # pairs with an empty member or an unparsable number are skipped.
    for lat, lng in window(re.findall(DECIMAL_COORDS, text), n=2):
        if not (lat and lng):
            continue
        try:
            pair = (float(lat), float(lng))
        except ValueError:
            continue
        coords.append(pair)

    coords.extend(
        (float(lat), float(lng))
        for lat, lng in re.findall(PARANTHETICAL_COORDS, text)
    )

    # These matches carry a direction next to each number: latitude is
    # positive only when its direction contains "n", longitude is negative
    # only when its direction contains "w".
    for raw_lat, latdir, raw_lng, lngdir in re.findall(PORTLAND_COORDS, text):
        lat_sign = 1 if 'n' in latdir.lower() else -1
        lng_sign = -1 if 'w' in lngdir.lower() else 1
        coords.append((abs(float(raw_lat)) * lat_sign,
                       abs(float(raw_lng)) * lng_sign))

    return coords
Ejemplo n.º 2
0
def makefile_to_slides(fp):
    """Yield ``(title, command)`` pairs built from the lines of a Makefile.

    Blank lines, lines starting with ``all`` and lines containing ``wget``
    are skipped.  A line that does not start with a tab sets the current
    title to the text before its first ``:``.  Every tab-indented line is
    yielded together with the current title, re-wrapped over several lines
    using backslash continuations.
    """
    for line in fp:
        if line == '\n' or line.startswith('all') or 'wget' in line:
            continue

        if not line.startswith('\t'):
            title = line.split(':')[0]
            continue

        tokens = line.strip().split()
        wrapped = tokens[0]

        for previous, current in window(tokens, 2):
            if current.startswith('--'):
                # Nothing is emitted for an option token itself; it is
                # written on the next step, together with the token after it.
                continue
            if previous.startswith('--'):
                # Option followed by its value: both go on a fresh line.
                wrapped += ' \\\n  %s %s' % (previous, current)
            elif len(wrapped.split('\n')[-1] + current) < 15:
                # Short enough to stay on the current output line.
                wrapped += ' %s' % current
            else:
                wrapped += ' \\\n  %s' % current

        yield title, wrapped
Ejemplo n.º 3
0
def test_window():
    """Check that ``window`` with ``n=3`` yields each run of three items."""
    # range(8) has six overlapping triples, starting at 0 through 5.
    expected = [(start, start + 1, start + 2) for start in range(6)]
    observed = [tuple(chunk) for chunk in window(range(8), n=3)]
    n.assert_list_equal(observed, expected)
Ejemplo n.º 4
0
def character(text):
    """Return the text that follows the notice's link line.

    The input is split on single carriage returns / newlines and walked in
    consecutive pairs.  Once a line containing both ``Full Public Notice``
    and ``Project Plans`` has been seen, the second member of each later
    pair is collected when non-blank, up to ten fragments.  The fragments
    are returned joined by single spaces ('' if nothing was collected).
    """
    stripped_lines = (line.strip() for line in re.split(r'[\r\n]', text))
    fragments = []
    capturing = False
    for current, following in window(stripped_lines):
        if 'Full Public Notice' in current and 'Project Plans' in current:
            capturing = True
            continue
        if not capturing or len(fragments) >= 10:
            continue
        fragment = following.strip()
        if fragment:
            fragments.append(fragment)
    return ' '.join(fragments)
Ejemplo n.º 5
0
def applicant(text):
    """Find the applicant named in a notice.

    Non-blank lines are scanned in consecutive pairs for the first line
    that starts with ``applicant:`` or ``name of applicant:`` (case
    insensitive).  The value is whatever follows the colon on that line,
    or the following line when nothing follows the colon.  The value is
    then cut at the first ``APPLICANT_REGEXES['shorten']`` match and
    stripped.  Returns ``None`` when no value is found.
    """
    nonblank = (line.strip() for line in re.split(r'[\n\r]+', text)
                if line.strip())
    found = None
    for current, following in window(nonblank):
        header = re.match(r'^(?:name of |)applicant:(.*)',
                          current or '', flags=re.IGNORECASE)
        if not header:
            continue
        # Prefer the text on the header line; fall back to the next line.
        found = header.group(1).strip() or following
        break
    if not found:
        return None
    pieces = re.split(APPLICANT_REGEXES['shorten'], found, maxsplit=0)
    return pieces[0].strip()
Ejemplo n.º 6
0
def amici(brief:str) -> list:
    """Parse a brief description into a list of strings, one per filer.

    The result is presumably the names of the amici -- confirm against
    callers.  Entries whose text matches a ``MANUAL_OVERRIDE`` pattern
    return the hand-written result stored alongside that pattern; all
    other entries go through the heuristics below.
    """
    # Hand-maintained exceptions win over the heuristics. re.match anchors
    # each pattern at the start of the brief text.
    for member, result in MANUAL_OVERRIDE:
        if re.match(member, brief):
            return result

    _amicus_regex = re.compile(r'(?:amicus brief|amici brief|amici curiae|amicus curiae|motion for leave to file and brief)(?: of)?', flags = re.IGNORECASE)
    # NOTE(review): _remove_date is defined elsewhere; presumably it strips
    # a date from the entry -- confirm.
    amici_section = _remove_date(brief)
    # Drop numbered "<digits>. Brief," markers wherever they occur.
    amici_section = re.sub(r'[0-9]+\. +Brief,', '', amici_section, flags = re.IGNORECASE)

    # A marker phrase starting within the first 30 characters is treated as
    # a prefix: it and everything before it are cut off.
    match = re.search(_amicus_regex, amici_section)
    if match != None and match.start() < 30:
        amici_section = amici_section[match.end():]
    # A marker phrase starting past two fifths of the remaining text is
    # treated as a suffix: it and everything after it are cut off.
    match = re.search(_amicus_regex, amici_section)
    if match != None and match.start() > len(amici_section)*2/5:
        amici_section = amici_section[:match.start()]

    # Remove " in support of ..." up to the end of that line.
    amici_section = re.sub(r' in support of .*', '', amici_section, flags = re.IGNORECASE)

    # Insert ", " where "The " directly follows a non-space character,
    # e.g. "Inc.The " becomes "Inc., The ".
    amici_section = re.sub(r'([^ ])The ', r'\1, The ', amici_section)

    # Choose the separator pattern from the punctuation in the text:
    # semicolons if any are present; otherwise commas (plus " and the" /
    # " and other") when the text looks comma-delimited; otherwise commas
    # or a plain " and".
    onlycomma = r'(?:,| and the| and other) '
    l = amici_section.lower()
    if l.count(';') > 0:
        _regex = r'; '
    elif l.count(',') > 3 or ', and' in l or l.count(',') > l.count('and') or l.count(',') == l.count(', inc'):
        _regex = onlycomma 
    else:
        _regex = r'(?:,| and) '
    amicus_separator = re.compile(_regex, flags = re.IGNORECASE)

    def clean(result):
        """Strip filler words and "brief ..." phrases from one fragment."""
        r = result.strip()
        # Trailing " as" and a leading "for" / "of" / "and" / "amic(i|us) curiae".
        r = re.sub(r' as ?$', '', r, flags = re.IGNORECASE)
        r = re.sub(r'^ ?(for|of|and|amic(i|us) curiae) ?', '', r, flags = re.IGNORECASE)
        match = re.search(r'brief(?: for| of)?(?: the)?(?: amic(?:us|i) curiae)? ', r, flags = re.IGNORECASE)
        # A "brief ..." phrase starting in the first half keeps the text after
        # it; one ending in the second half keeps the text before it.
        # NOTE(review): positions come from the stripped `r` but are compared
        # against the length of the unstripped `result` -- confirm intended.
        if match and match.start() < len(result) / 2:
            return r[match.end():]
        elif match and match.end() > len(result) / 2:
            return r[:match.start()]
        else:
            return r

    # Clean twice
    # NOTE(review): _amicus is defined elsewhere; presumably it splits the
    # section on amicus_separator -- confirm. The text is passed through
    # unidecode first.
    results = map(clean, map(clean, _amicus(unidecode(amici_section), amicus_separator, 0)))
    # Pad with '' on both ends so every fragment is visited together with
    # its previous and next neighbour.
    slider = window(chain([''], results, ['']), n = 3)
    out = []
    for previous_result, current_result, next_result in slider:
        if re.match(r'^(|as|amic(i|us) curiae)$', current_result, flags = re.IGNORECASE):
            # Empty or pure-filler fragment: drop it.
            pass
        elif re.match(r'^[^a-z]{0,2}(inc|jr)[^a-z]{0,2}', next_result, flags = re.IGNORECASE):
            # The next fragment begins with "inc"/"jr": glue it back on and
            # advance the iterator so it is not visited as a fragment itself.
            out.append(current_result + ', ' + next_result)
            next(slider)
        elif re.match(r'^et al\.?,?$', next_result, flags = re.IGNORECASE):
            # Same treatment for a following "et al".
            out.append(current_result + ', ' + next_result)
            next(slider)
        else:
            out.append(current_result)

    # With the comma separator, " and " was not split on; when there are at
    # least three results, split the last one on " and ".
    if len(out) >= 3 and _regex == onlycomma:
        out = out[:-1] + re.split(r' and ', out[-1], flags = re.IGNORECASE)

    def finalize(result):
        """Remove a leading "amici"/"of ", an " on behalf of..." tail and a trailing comma."""
        result = re.sub(r'^ ?(amici|of )', '', result, flags = re.IGNORECASE)
        result = re.sub(r' (on behalf of).*$', '', result, flags = re.IGNORECASE)
        result = re.sub(r',$', '', result)
        return result
    return list(map(finalize, out))
Ejemplo n.º 7
0
def derivative(counter):
    """Return the key-to-key differences of ``counter`` as a new Counter.

    Keys are visited in sorted order.  Each key maps to its count minus
    the count of the previous key; the first key is compared against
    ``min(counter) - 1``, which a ``Counter`` reports as 0 when absent.

    An empty ``counter`` yields an empty ``Counter`` instead of letting
    ``min()`` raise ``ValueError``.
    """
    dcounter = Counter()
    if not counter:
        return dcounter

    # Walk the sorted keys while remembering the key seen just before.
    previous = min(counter) - 1
    for key in sorted(counter):
        dcounter[key] = counter[key] - counter[previous]
        previous = key
    return dcounter
Ejemplo n.º 8
0
    def process(self, root_dir, **kwargs):
        """ TODO """
        # Read keyword arguments
        est_curpair = kwargs.get("estimate_curpair", "EURUSD")
        feature_curpairs = kwargs.get(
            "feature_curpairs",
            [
                "AUDJPY",
                "AUDNZD",
                "AUDUSD",
                "CADJPY",
                "CHFJPY",
                "EURCHF",
                "EURGBP",
                "EURJPY",
                "EURUSD",
                "GBPJPY",
                "GBPUSD",
                "NZDUSD",
                "USDCAD",
                "USDCHF",
                "USDJPY",
            ],
        )
        output_file = kwargs.get("output", "output")
        window_size = kwargs.get("window_size", 3)
        timeslot_len = timedelta(minutes=kwargs.get("timeslot_len", 60))
        self.__remove_timebounds = set()

        # Extract zip files from directory
        zipfiles = [
            [os.path.join(root_dir, curpair, zipfile) for zipfile in os.listdir(os.path.join(root_dir, curpair))]
            for curpair in os.listdir(root_dir)
        ]

        with open(output_file, "w") as fp:
            months = len(zipfiles[0])  # number of months in given dataset
            for month in xrange(months):
                print "working in %sth month" % month
                # get zip files of current month
                month_zipfiles = [e[month] for e in zipfiles]

                # initialize data
                data = {}

                for filename in month_zipfiles:
                    with self.__unarchive_zipfile(filename) as file_obj:
                        curpair = self.__get_currency_pair(filename)
                        if curpair not in feature_curpairs:
                            continue
                        print "Aggregating %s" % filename
                        self.__aggregate_data(file_obj, self.__get_currency_pair(filename), timeslot_len, data)

                data = {k: v for k, v in data.iteritems() if k not in self.__remove_timebounds}

                # Iterate data with a sliding window, for each sequence, the
                # first n items serve as feature and last item serves as label
                data_it = iter(data[i] for i in sorted(data))
                for seq in window(data_it, window_size + 1):
                    print "Generaing log..."
                    self.__print_result(fp, seq, est_curpair)