Python AsciiDammit 예제들, AsciiDammit Python 예제들

예제 #1

0

파일 보기

파일: csv_example.py 프로젝트: BrianSipple/dedupe

def preProcess(column):
    """
    Normalize a raw field value so trivially different spellings compare equal.

    The value is passed through
    [AsciiDammit](https://github.com/tnajdek/ASCII--Dammit), newlines are
    turned into spaces, runs of spaces are collapsed to a single space,
    surrounding whitespace and quote characters are stripped, and the result
    is lower-cased.

    :param column: the raw field value.
    :return: the cleaned, lower-cased string.
    """

    column = AsciiDammit.asciiDammit(column)
    # Replace newlines *before* collapsing spaces; doing it the other way
    # round leaves a run of spaces behind for inputs such as "a\n\nb".
    column = column.replace('\n', ' ')
    column = re.sub(' {2,}', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

예제 #2

0

파일 보기

파일: csv_data_matching_example.py 프로젝트: TinaCloud/dedupe-gaptor

def preProcess(column):
    """
    Clean a field value before it is used for record matching.

    Steps, in order: convert via
    [AsciiDammit](https://github.com/tnajdek/ASCII--Dammit), turn newlines
    into spaces, collapse repeated spaces, strip surrounding whitespace and
    single/double quotes, and lower-case the result.

    :param column: the raw field value.
    :return: the cleaned, lower-cased string.
    """

    column = AsciiDammit.asciiDammit(column)
    # Newlines must become spaces first: collapsing spaces beforehand would
    # miss the runs that the newline replacement itself creates.
    column = column.replace('\n', ' ')
    column = re.sub(' {2,}', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

예제 #3

0

파일 보기

파일: csv_example.py 프로젝트: JeffDonovan/dedupe

def preProcess(column):
    """
    Strip out differences that should not matter when looking for duplicates.

    Casing, extra spaces, surrounding quotes and newlines are removed or
    normalized so that values differing only in those respects end up
    identical. The value is first passed through AsciiDammit.

    :param column: the raw field value.
    :return: the cleaned, lower-cased string.
    """

    column = AsciiDammit.asciiDammit(column)
    # Convert newlines to spaces before collapsing, so that spaces introduced
    # by the conversion are collapsed as well (e.g. "a \n b" -> "a b").
    column = column.replace('\n', ' ')
    column = re.sub(' {2,}', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column

예제 #4

0

파일 보기

파일: fetch.py 프로젝트: dkellerman/youtunes

def fetch(page=1, page_size=DEFAULT_PAGE_SIZE):
    """Search YouTube for the tracks in the iTunes library.

    Walks the tracks returned by ``get_itunes_tracks()``, starting at an
    offset derived from ``page`` and ``page_size`` and wrapping around the
    end of the library, and runs one YouTube search per track for every
    field in ``TRACK_SEARCH_FIELDS``. Searching stops as soon as
    ``page_size`` distinct videos have been collected or every track has
    been visited once.

    :param page: page number; page 1 starts at the first track.
    :param page_size: maximum number of distinct videos to return.
    :return: dict mapping video id (the last ``/``-separated segment of
        ``entry.id.text``) to the search result entry. Empty when the
        library has no tracks.
    """

    tracks = get_itunes_tracks()
    num_tracks = len(tracks)
    if num_tracks == 0:
        # Nothing to search for; also avoids the modulo-by-zero below.
        return {}

    results_per_track = len(TRACK_SEARCH_FIELDS) * RESULTS_PER_QUERY
    abs_track_index = int((page_size * (page-1)) / results_per_track)
    track_index = abs_track_index % num_tracks
    query_index = (int((abs_track_index+1) / num_tracks)*RESULTS_PER_QUERY) + 1
    query_index %= 1000 - RESULTS_PER_QUERY

    # Visit every track exactly once, starting at track_index and wrapping
    # around. A second lap could never issue a new search (every term is
    # already recorded in search_terms_used), so an unbounded cycle would
    # spin forever whenever fewer than page_size videos are available.
    tracks = itertools.islice(itertools.cycle(tracks),
                              track_index, track_index + num_tracks)
    results = {}
    search_terms_used = set()

    for track in tracks:
        for field in TRACK_SEARCH_FIELDS:
            search_term = track.get(field, None)
            if search_term is None:
                continue

            # asciify search term and put it in quotes and lower case
            # NOTE(review): on Python 3 ``.encode`` returns bytes, which
            # would not work with the str formatting below; this code
            # appears to target Python 2 -- confirm before porting.
            search_term = (AsciiDammit.asciiDammit(search_term)
                           .encode("ascii", "ignore")
                           .lower())
            search_term = '"%s"' % search_term.replace("\"", "'")

            # don't search for same term twice
            if (field, search_term) in search_terms_used:
                continue

            # only ask for as many results as are still needed
            max_results = min(RESULTS_PER_QUERY,
                              page_size - len(results))
            if max_results == 0:
                break

            entries = youtube_search([search_term],
                                     max_results=max_results,
                                     start_index=query_index)
            search_terms_used.add((field, search_term))

            # index results by id
            for entry in entries:
                video_id = entry.id.text.split('/')[-1]
                logging.info("%s | %s", video_id, entry.media.title.text)
                if video_id not in results:
                    results[video_id] = entry

        if len(results) >= page_size:
            break

    return results

예제 #5

0

파일 보기

파일: csvhelpers.py 프로젝트: johnlauck/csvdedupe

def preProcess(column):
    """
    Clean a raw field value.

    Values that start with ``LatLong`` are parsed into a pair of floats:
    the first 8 characters and the final character are dropped and the
    remainder is split on commas. If the parts are not numeric, or there
    are fewer than two of them, ``(0.0, 0.0)`` is returned.

    Every other value is cleaned as text with the help of
    [AsciiDammit](https://github.com/tnajdek/ASCII--Dammit) and Regex:
    newlines become spaces, runs of spaces are collapsed, surrounding
    whitespace and quotes are stripped, and the result is lower-cased.

    :param column: the raw field value (a string).
    :return: a ``(float, float)`` tuple for ``LatLong`` values, otherwise
        the cleaned string.
    """
    if column.startswith('LatLong'):
        try:
            vals = column[8:-1].split(',')
            column = (float(vals[0]), float(vals[1]))
        except (ValueError, IndexError):
            # ValueError: a part is not a number.
            # IndexError: fewer than two comma-separated parts.
            column = (0.0, 0.0)
    else:
        column = AsciiDammit.asciiDammit(column)
        # Replace newlines before collapsing spaces so that the spaces the
        # replacement introduces are collapsed too.
        column = column.replace('\n', ' ')
        column = re.sub(' {2,}', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()

    return column

예제 #6

0

파일 보기

# Module-level containers initialised empty before the schedule is walked.
# NOTE(review): this excerpt does not show where they are filled in.
#speaker_events_dict[speakernosp] = []
tracks = []   # list of tracks
tracksdict = {}   # for use in creating a dictionary of tracks with no space in their name
rooms = []
roomsdict = {}
speakerdict = {}
hourslist = {}
speaker_events_dict = {}
# Go through the schedule row by row; `index` is the row number and `x` the
# row itself.
for index, x in enumerate(pconsched.schedule):
    # (Re)initialize the session dictionary for this row.
    session = {} 
    # For each field, grab the data and put it in the session dictionary.
    for field in fields:
        # Look up this field's cell in the current row via the header
        # mapping and pass it through AsciiDammit; kept in a variable to
        # keep the code below clean.
        fieldtext = AsciiDammit.asciiDammit(pconsched.schedule[index][pconsched.headerdict[field]])

        # Record the raw row and its position on the session. These two
        # assignments are repeated on every field iteration.
        
        session['input'] = x
        session['index'] = index
        # Start day and time assignments.
        if  (field == "Start Date"):
            session['startday'] = fieldtext
            # Pick the day header whose date matches this session's start
            # date; 'dayheader' is not set here when none of them match.
            if (fieldtext == friday_date):
                session['dayheader'] = friday_header
            if (fieldtext == saturday_date):
                session['dayheader'] = saturday_header
            if (fieldtext == sunday_date):
                session['dayheader'] = sunday_header
            if (fieldtext == ''):

예제 #7

0

파일 보기

# Maps a track key (the first key of a `trackinfo` entry) to the list of
# unique [line[6], line[8], line[7]] entries, each passed through AsciiDammit.
trackdict = {}

logging.debug("schedule in list form: \n%s", build)

for line in build:
    logging.debug("line: %s", line)
    logging.debug(len(line))
    # Text of column 5 up to the first comma; only its first three
    # characters (upper-cased) are used to match a track.
    atrac = line[5].split(',')[0]
    track_prefix = atrac[0:3].upper()
    for currenttrac in trackinfo:
        # Evaluate the first key once per entry. list(...)[0] behaves like
        # the former .keys()[0] on Python 2 and also works with Python 3
        # dict views, which cannot be indexed.
        track_key = list(currenttrac.keys())[0]
        if track_key[0:3] != track_prefix:
            continue
        logging.debug("track: %s", track_prefix)
        logging.debug("current track: %s", track_key)
        logging.debug("line[6] %s", line[6])
        trackcontent = [
            AsciiDammit.asciiDammit(line[6]),
            AsciiDammit.asciiDammit(line[8]),
            AsciiDammit.asciiDammit(line[7]),
        ]
        # Create the track's list on first use and skip duplicates.
        entries = trackdict.setdefault(track_key, [])
        if trackcontent not in entries:
            entries.append(trackcontent)
    logging.debug(line)

# Sort each track's entries in place.
for entries in trackdict.values():
    entries.sort()
logging.debug("sorted dict's list\n\n")
logging.debug("track dictionary: %s", trackdict)

예제 #8

0

파일 보기

파일: schedcsv.py 프로젝트: penguiconscheduling/schedcsv

# Module-level containers initialised empty before the schedule is walked.
# NOTE(review): this excerpt does not show where they are filled in.
# speaker_events_dict[speakernosp] = []
tracks = []   # list of tracks
tracksdict = {}   # for use in creating a dictionary of tracks with no space in their name
rooms = []
roomsdict = {}
speakerdict = {}
hourslist = {}
speaker_events_dict = {}
# Go through the schedule row by row; `index` is the row number and `x` the
# row itself.
for index, x in enumerate(pconsched.schedule):
    # (Re)initialize the session dictionary for this row.
    session = {}
    # For each field, grab the data and put it in the session dictionary.
    for field in fields:
        # Look up this field's cell in the current row via the header
        # mapping and pass it through AsciiDammit; kept in a variable to
        # keep the code below clean.
        fieldtext = AsciiDammit.asciiDammit(
            pconsched.schedule[index][pconsched.headerdict[field]])

        # Record the raw row, its position and the fixed default values on
        # the session. These assignments are repeated on every field
        # iteration.

        session['input'] = x
        session['index'] = index
        session['All Day Event'] = ''
        session['private'] = "PUBLIC"
        session['avneeds'] = ""
        session['alldayorder'] = "3"
        # Start day and time assignments: take a prefix of the
        # "event_start" text as the dash-formatted date.
        # NOTE(review): the slice end is
        # len(fieldtext) - index_of_first_space + 1, not the index of the
        # space itself -- confirm this really isolates the date part.
        if (field == "event_start"):
            logging.debug(fieldtext[:len(fieldtext)-fieldtext.find(" ") + 1])
            dashdate = fieldtext[:len(fieldtext)-fieldtext.find(" ") + 1]
            slashdate = "{}/{}/{}".format(dashdate[6:7],

예제 #9

0

파일 보기

파일: full_sched_web_template_filler.py 프로젝트: waldo323/schedcsv

    build = list(reader)

# Maps a track key (the first key of a `trackinfo` entry) to the list of
# unique [line[6], line[8], line[7]] entries, each passed through AsciiDammit.
trackdict = {}

logging.debug("schedule in list form: \n%s", build)

for line in build:
    logging.debug("line: %s", line)
    logging.debug(len(line))
    # Text of column 5 up to the first comma; only its first three
    # characters (upper-cased) are used to match a track.
    atrac = line[5].split(',')[0]
    track_prefix = atrac[0:3].upper()
    for currenttrac in trackinfo:
        # Evaluate the first key once per entry. list(...)[0] behaves like
        # the former .keys()[0] on Python 2 and also works with Python 3
        # dict views, which cannot be indexed.
        track_key = list(currenttrac.keys())[0]
        if track_key[0:3] != track_prefix:
            continue
        logging.debug("track: %s", track_prefix)
        logging.debug("current track: %s", track_key)
        logging.debug("line[6] %s", line[6])
        trackcontent = [
            AsciiDammit.asciiDammit(line[6]),
            AsciiDammit.asciiDammit(line[8]),
            AsciiDammit.asciiDammit(line[7]),
        ]
        # Create the track's list on first use and skip duplicates.
        entries = trackdict.setdefault(track_key, [])
        if trackcontent not in entries:
            entries.append(trackcontent)
    logging.debug(line)

# Sort each track's entries in place.
for entries in trackdict.values():
    entries.sort()
logging.debug("sorted dict's list\n\n")
logging.debug("track dictionary: %s", trackdict)

# Jinja2 environment that loads templates from the 'templates' directory.
env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))