Ejemplo n.º 1
0
def main(input_filename_list=[]):
    # Make input filename list
    if input_filename_series and input_whole_folder:
        raise Exception("Series or folder. Can't have both. Sorry.")
    if input_filename_series:
        for g in os.listdir(input_path):
            if g.startswith(input_filename_list_range[0]):
                list_started = True
            if g.endswith('.wav') and list_started is True:
                input_filename_list.append(g)
                # If end of range reached
                if g.startswith(input_filename_list_range[1]):
                    break
        print 'Input filenames (series) ({0}):'.format(len(input_filename_list)), input_filename_list
    elif input_whole_folder:
        for f in os.listdir(input_path):
            if f.endswith('.wav'):
                input_filename_list.append(f)
        print 'Input filenames (folder) ({0}):'.format(len(input_filename_list)), input_filename_list
    else:
        input_filename_list = [input_filename]

    # Grab spreadsheet from Google Drive
    utils.export_csv('Items')

    # RUN THE TRAP    
    pool_array = [os.path.join(input_path, each) for each in input_filename_list]
    
    if pool_processing:
        pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
        pool.map(track_file, pool_array)
    else:
        for item in pool_array:
            track_file(item)
Ejemplo n.º 2
0
def convert_metadata(f,t):
    """Flatten the ' +++$+++ '-separated metadata file `f` into the ';'-separated CSV `t`.

    Each input line yields one output row per entry of the JSON array stored in
    its sixth column; the third column is truncated to its first four characters.
    """
    rows = []
    for record in utils.read_csv(f, ' +++$+++ '):
        film_id = record[0]
        title = record[1]
        year = record[2][:4]
        rating = record[3]
        rating_num = record[4]
        rows.extend(
            [film_id, title, year, rating, rating_num, film_type]
            for film_type in utils.parse_json_array(record[5])
        )
    utils.export_csv(rows, t, ';')
def main():
    """Build the item list for the enabled input mode and produce the enabled outputs.

    Input modes are checked in this order and the first enabled one wins:
    ``just_add_tags``, ``whole_folder``, ``some_orders``, ``some_items``.
    With no mode enabled the item list is empty.

    If any item reports metadata errors they are printed and the program
    stops via ``sys.exit()`` before anything is generated.
    """
    logging.basicConfig(level=log_level)
    utils.export_csv('Items') # Grab spreadsheet from Google Drive

    if just_add_tags:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if os.path.splitext(each_file.lower())[1][1:] in digital_formats]
    elif whole_folder:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if each_file.lower().endswith('.wav')]
    elif some_orders:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if (each_file.lower().endswith('.wav') and each_file[:5] in orders_input)]
    elif some_items:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if (each_file.lower().endswith('.wav') and
                              any(each_item_input in each_file for each_item_input in items_input))]
    else:
        # No mode enabled: item_list used to stay unbound and the next
        # statement died with a NameError.
        selected_files = []
    item_list = [Item(each_file) for each_file in selected_files]

    # Halt program on metadata errors
    metadata_errors = [error for item in item_list for error in item.metadata_errors]
    if metadata_errors:
        for error in metadata_errors:
            print error
        print '{0} Metadata errors found! See above.'.format(len(metadata_errors))
        sys.exit()

    logging.info('item_list ({0}): {1}'.format(len(item_list), [item.name for item in item_list]))

    item_list = format_list(item_list)

    #if cd_pdf_toggle: make_cd_pdf(item_list)
    if front_pdf_toggle: make_front_pdf(item_list)
    if back_pdf_toggle: make_back_pdf(item_list)
    if burn_cds_toggle: burn_cds(item_list)
    if digital_files_toggle: make_digital_files(item_list)
Ejemplo n.º 4
0
def sensitivity(years, kwLimit, min_edge_th):
    """Run ``get_communities`` over a grid of thresholds and export the results.

    Loads the pickled graph for the given year range, keyword limit and minimum
    edge threshold, evaluates every (dth, eth) pair of the grid below, and
    writes one row per returned community structure to
    ``sensitivity/sensitivity_<first>-<last>_<kwLimit>_eth<min_edge_th>.csv``.

    Args:
        years: Sequence of year strings; the first and last name the range.
        kwLimit: Keyword limit, used only to build the file names.
        min_edge_th: Minimum edge threshold, used only to build the file names.
    """
    print('Sensitivity analysis for years ' + str(years))
    yearrange = years[0] + "-" + years[-1]
    run_id = yearrange + '_' + str(kwLimit) + '_eth' + str(min_edge_th)

    # NOTE(review): pickle.load can execute arbitrary code; only load graph
    # files produced by this project.
    # `with` closes the file handle, which the bare open() call never did.
    with open('pickled/graph_' + run_id + '.pkl', 'rb') as graph_file:
        graph = pickle.load(graph_file)

    dthvals = numpy.arange(0.01, 0.125, 0.005)
    ethvals = numpy.arange(10, 205, 5)
    #mincomsizevals=[0,4,10] # remove min com size, additional filtering does not makes really sense

    res = []
    for dth in dthvals:
        for eth in ethvals:
            print('eth = ' + str(eth) + ' ; dth = ' + str(dth))
            fgraph, coms = get_communities(graph, dth, eth)
            # Same filtered graph for every community structure of this pair.
            vcount = fgraph.vcount()
            for i, com in enumerate(coms):
                comnum = len(com.sizes())
                modularity = com.modularity
                res.append([dth, eth, comnum, vcount, modularity])
                print(
                    str(i) + " ; " + str(dth) + " ; " + str(eth) + " : " +
                    str(comnum) + " ; " + str(vcount) + " ; " +
                    str(modularity))
    # export res
    utils.export_csv(
        res, 'sensitivity/sensitivity_' + run_id + '.csv', ";",
        "dispth;eth;comnum;vcount;modularity")
Ejemplo n.º 5
0
def filter_events(data, filename="events.csv"):
    """Extract events from ticket descriptions and export them to a CSV file.

    For every ticket in ``data['tickets']`` that has a description, the text is
    run through ``extract_event`` and ``parse_clause``. Events with a non-empty
    'condition' are kept and enriched with a fixed set of ticket fields.

    Args:
        data: Mapping with an optional 'tickets' mapping of key -> ticket dict.
        filename: Name of the CSV file passed to ``export_csv``.

    Returns:
        dict: Ticket key -> event dict for every kept event.
    """
    copied_fields = (
        'name', 'summary', 'type', 'subtask', 'project', 'project key',
        'status'
    )

    # process the description text for information
    events = {}
    for k, v in data.get('tickets', {}).items():
        descr = v.get('description')
        if not descr:
            continue
        event = parse_clause(extract_event(descr))
        # Truthiness instead of len(): .get() yields None when 'condition' is
        # missing, and len(None) raises TypeError.
        if not event.get('condition'):
            continue
        for f in copied_fields:
            event[f] = v.get(f)
        events[k] = event

    # export the events to a csv
    rows = []
    for k, v in events.items():
        row = {'key': k}
        for t, c in v.items():
            # List values are written as one multi-line cell.
            row[t] = '\n'.join(c) if isinstance(c, list) else c
        rows.append(row)

    export_csv(data=rows, filename=filename)
    logger.info('Exported {} events'.format(len(rows)))
    return events
Ejemplo n.º 6
0
def print_order_notes(item_list=None, refresh_csv=True):
    if not item_list:
        # Input whole Tracked folder
        item_list = [Item(each_file) for each_file in os.listdir(config.tracked_folder) if os.path.splitext(each_file)[-1] == '.wav']

    if refresh_csv:
        utils.export_csv('Items') # Grab spreadsheet from Google Drive

    # items.csv item notes
    notes_lines = [' - items.csv -']
    for item in [item for item in item_list if not item.copy_counter and not item.side]:
        if item.customer_notes:
            notes_lines.append('{0} - Customer notes: {1}'.format(item.name, item.customer_notes))
        if item.private_notes and len(item.private_notes) > 3:
            notes_lines.append('{0} - Private notes: {1}'.format(item.name, item.private_notes))

    # WooCommerce order notes
    notes_lines.append(' - WC notes -')
    wc_client = WooCommerceClient(config.wc_ck, config.wc_cs, config.base_url, oauth_enabled=False)
    for order_id in set([item.order.lstrip('0') for item in item_list]):
        order = wc_client.get_order(order_id)['order']
        html_parser = HTMLParser.HTMLParser()
        notes_lines.append('{0} - {1}'.format(order_id, html_parser.unescape(order['note'])))

    # Print notes; write to text file
    for notes_line in notes_lines:
        print notes_line
    order_notes_file = os.path.join(config.order_notes_folder, 'order_notes.txt')
    with open(order_notes_file, 'w') as notes:
        for notes_line in notes_lines:
            notes.write(notes_line + '\n')

    # Open notes in default text editor
    os.startfile(order_notes_file)
Ejemplo n.º 7
0
def main(input_filename_list=[]):
    # Make input filename list
    if config.input_orders and config.input_items:
        raise Exception("Orders or items. Can't have both. Sorry.")
    for f in os.listdir(config.clean_folder):
        if f.endswith('.wav'):
            input_filename_list.append(f)
    print 'Input filenames (folder) ({0}):'.format(len(input_filename_list)), input_filename_list

    # Grab spreadsheet from Google Drive
    utils.export_csv('Items')

    # RUN THE TRAP
    pool_array = [os.path.join(config.clean_folder, each) for each in input_filename_list]
    
    if config.pool_processing:
        pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
        pool.map(track_file, pool_array)
    else:
        for item in pool_array:
            track_file(item)

    # Class-ify item input list
    item_list = [Item(each_file) for each_file in os.listdir(config.tracked_folder) if os.path.splitext(each_file)[-1] == '.wav']

    # Print order notes
    order.print_order_notes(item_list)
Ejemplo n.º 8
0
def main():
    logging.basicConfig(level=config.log_level)
    metadata_errors = []
    utils.export_csv("Items")  # Grab spreadsheet from Google Drive

    if config.just_add_tags:
        print "Just Add Tags selected."
        input_list = [
            each_file
            for each_file in os.listdir(config.tracked_folder)
            if os.path.splitext(each_file.lower())[1][1:] in digital_formats
        ]
    elif config.input_whole_folder:
        input_list = [
            each_file for each_file in os.listdir(config.tracked_folder) if each_file.lower().endswith(".wav")
        ]
    elif config.input_orders:
        print "Input orders:", config.input_orders
        input_list = [
            each_file
            for each_file in os.listdir(config.tracked_folder)
            if (each_file.lower().endswith(".wav") and each_file[:5] in config.input_orders)
        ]
    elif config.input_items:
        print "Input items:", config.input_items
        input_list = [
            each_file
            for each_file in os.listdir(config.tracked_folder)
            if (
                each_file.lower().endswith(".wav")
                and any(each_item_input in each_file for each_item_input in config.input_items)
            )
        ]

    # Class-ify item input list
    item_list = [Item(each_file, tracks_added=True) for each_file in input_list]

    # Halt program on metadata errors
    metadata_errors = [error for item in item_list for error in item.metadata_errors]
    if metadata_errors:
        for error in metadata_errors:
            print error
        print "{0} Metadata errors found! See above.".format(len(metadata_errors))
        sys.exit()

    logging.info("item_list ({0}): {1}".format(len(item_list), [item.name for item in item_list]))

    item_list = format_list(item_list)

    # if config.cd_pdf_toggle: make_cd_pdf(item_list)
    if config.front_pdf_toggle:
        make_front_pdf(item_list)
    if config.back_pdf_toggle:
        make_back_pdf(item_list)
    if config.burn_cds_toggle:
        burn_cds(item_list)
    order.print_order_notes(item_list, refresh_csv=False)
    if config.digital_files_toggle:
        make_digital_files(item_list)
Ejemplo n.º 9
0
def convert_metadata(f, t):
    """Expand each metadata line of `f` into one row per JSON-array entry and write `t`.

    Input is read with the ' +++$+++ ' separator; the output rows hold the
    first five columns (third one cut to four characters) followed by one
    entry of the JSON array found in the sixth column, and are exported with
    ';' as separator.
    """
    res = []
    for line in utils.read_csv(f, ' +++$+++ '):
        # Columns shared by every row generated from this line.
        shared = [line[0], line[1], line[2][:4], line[3], line[4]]
        for filmtype in utils.parse_json_array(line[5]):
            res.append(shared + [filmtype])
    utils.export_csv(res, t, ';')
Ejemplo n.º 10
0
def main(input_filename_list=[], output_path=[]):
    needs_split_points = []
    split_files_counter = 0
    if not output_path or output_path.lower == 'none':
        output_path = input_path

    # Make input filename list
    if input_filename_series and input_whole_folder:
        print 'Series or folder. Can\'t have both. Sorry.'
        raise SystemError
    if input_filename_series:
        for g in os.listdir(input_path):
            if g.startswith(input_filename_list_range[0]):
                list_started = True
            if g.endswith('.wav') and list_started is True:
                input_filename_list.append(g)
                # If end of range reached
                if g.startswith(input_filename_list_range[1]):
                    break
        print 'Input filenames (series) ({0}):'.format(len(input_filename_list)), input_filename_list
    elif input_whole_folder:
        for f in os.listdir(input_path):
            if f.endswith('.wav'):
                input_filename_list.append(f)
        print 'Input filenames (folder) ({0}):'.format(len(input_filename_list)), input_filename_list
    else:
        input_filename_list = [input_filename]

    # Grab spreadsheet from Google Drive
    utils.export_csv('Items')

    # RUN THE TRAP
    pool_array = [os.path.join(input_path, each) for each in input_filename_list]

    if pool_processing:    
        pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1)
        pool.map(split_cd, pool_array)
    else:
        for each in pool_array:
            results = split_cd(each)
            split_files_counter += results[0]
            if results[1]: needs_split_points.append(results[1])

    if needs_split_points:
        print 'Needs split points:', needs_split_points
    else:
        print 'CD splitting: great success! {0} files split.'.format(split_files_counter)
Ejemplo n.º 11
0
def npatent_years():
    """Count patents per sliding window of application years and export the counts.

    For every window of ``window-size`` consecutive years ending in 1976+window-1
    through 2012, queries ``patent.keywords`` for documents whose ``app_year``
    is one of the window's years (as strings) and records
    ``["<first>-<last>", count]``. The rows are written to
    ``data/patentcount_window<window>.csv``.
    """
    mongo = pymongo.MongoClient(utils.get_parameter('mongopath', True, True))
    data = []
    try:
        window = int(utils.get_parameter('window-size'))
        for year in range(1976 + window - 1, 2013):
            print(year)
            # A real list rather than map(): it is indexed below and sent as
            # the "$in" operand, neither of which works with Python 3's lazy
            # map object.
            years = [str(y) for y in range(year - window + 1, year + 1)]
            patents = mongo['patent']['keywords'].find(
                {"app_year": {
                    "$in": years
                }}, no_cursor_timeout=True)
            # NOTE(review): Cursor.count() is gone in PyMongo 4; switch to
            # Collection.count_documents() if the driver is upgraded.
            npatents = patents.count()
            yearrange = years[0] + "-" + years[-1]
            data.append([yearrange, npatents])
    finally:
        # Release the client's connections even if a query fails.
        mongo.close()
    utils.export_csv(data, 'data/patentcount_window' + str(window) + '.csv',
                     ";", "yearrange;count")
def main():
    """Build the item list for the enabled input mode and produce the enabled outputs.

    Input modes are checked in this order and the first enabled one wins:
    ``just_add_tags``, ``whole_folder``, ``some_orders``, ``some_items``.
    With no mode enabled the item list is empty.

    Raises:
        Exception: If any item reports metadata errors (they are printed first).
    """
    logging.basicConfig(level=log_level)
    utils.export_csv('Items') # Grab spreadsheet from Google Drive

    if just_add_tags:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if os.path.splitext(each_file.lower())[1][1:] in digital_formats]
    elif whole_folder:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if each_file.lower().endswith('.wav')]
    elif some_orders:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if (each_file.lower().endswith('.wav') and each_file[:5] in orders_input)]
    elif some_items:
        selected_files = [each_file for each_file in os.listdir(audio_folder)
                          if (each_file.lower().endswith('.wav') and
                              any(each_item_input in each_file for each_item_input in items_input))]
    else:
        # No mode enabled: item_list used to stay unbound and the code below
        # died with a NameError.
        selected_files = []
    item_list = [Item(each_file) for each_file in selected_files]

    # The error list used to stay empty, so the check below could never fire.
    # getattr() keeps this harmless should Item not expose metadata_errors.
    metadata_errors = [error for item in item_list
                       for error in getattr(item, 'metadata_errors', [])]
    if metadata_errors:
        for error in metadata_errors:
            print error
        msg = '{0} Metadata errors found! See above.'.format(len(metadata_errors))
        raise Exception(msg)

    add_compilation_counters(item_list)
    add_image_counters(item_list)

    logging.info('Compilation items: {0}'.format([item.name for item in item_list if item.compilation_counter]))
    logging.info('Unique images: {0}'.format(len(set([item.image for item in item_list]))))
    logging.info('item_list ({0}): {1}'.format(len(item_list), [item.name for item in item_list]))

    '''
    print
    print '{0} artist: {1}'.format(item_list[0].name, item_list[0].artist)
    item_list[0].print_tracks()
    print
    '''

    if cd_pdf_toggle: make_cd_pdf(item_list)
    if front_pdf_toggle: make_front_pdf(item_list)
    if back_pdf_toggle: make_back_pdf(item_list)
    if digital_files_toggle: make_digital_files(item_list)
Ejemplo n.º 13
0
def main(input_resplit_list=False):
    def add_serial_metadata(input_filename):
        def discogs_auth():
            """Interactively authorize against Discogs.

            Creates a client, opens its authorize URL in the web browser and
            prompts for the token, repeating until something is entered.
            Returns the string 'q' when the user enters q/Q, otherwise the
            client after its access token has been fetched.
            """
            while True:
                discogs = discogs_client.Client(CLIENT_NAME)
                discogs.set_consumer_key(CONSUMER_KEY, CONSUMER_SECRET)
                authorize_url = discogs.get_authorize_url()
                webbrowser.open(authorize_url[2])
                authorize_token = raw_input('Enter authorize token (Or q to quit): ')
                if authorize_token:
                    break
            if authorize_token.lower() == 'q':
                return 'q'
            access_token = discogs.get_access_token(authorize_token)
            logging.debug('access_token: {0}'.format(access_token))
            return discogs

        def average(x):
            """Return the arithmetic mean of the non-empty sequence x as a float."""
            count = len(x)
            assert count > 0
            total = sum(x)
            return float(total) / count

        def pearson_def(x, y):
            """
            Correlation between listed Discogs release durations and split FLAC file durations.

            x and y are equal-length, non-empty sequences of numbers. For
            exactly two points the product of the per-index min/max ratios is
            returned instead of Pearson's r. Would divide by zero if all
            durations are equal for x or y; returns 1 instead.
            """
            assert len(x) == len(y)
            n = len(x)
            assert n > 0
            if n == 2: # Pearson's is useless for n < 3
                return (min(x[0], y[0]) / float(max(x[0], y[0]))) * (min(x[1], y[1]) / float(max(x[1], y[1])))
            # Means computed inline so the function does not depend on a sibling helper.
            avg_x = float(sum(x)) / n
            avg_y = float(sum(y)) / n
            diffprod = 0
            xdiff2 = 0
            ydiff2 = 0
            for x_val, y_val in zip(x, y):
                xdiff = x_val - avg_x
                ydiff = y_val - avg_y
                diffprod += xdiff * ydiff
                xdiff2 += xdiff * xdiff
                ydiff2 += ydiff * ydiff
            # The guard used to test xdiff2 twice, so a constant y slipped
            # through and the division below raised ZeroDivisionError.
            if xdiff2 == 0 or ydiff2 == 0:
                return 1
            return diffprod / math.sqrt(xdiff2 * ydiff2)

        def to_seconds(time_str_input):
            """Convert 'M:SS' durations to whole seconds.

            Accepts a single string, a list of strings (returns a list), or an
            int that is already in seconds (returned unchanged). A string
            without ':' is read as plain seconds and an empty minutes field
            (':45') counts as zero minutes. Only the first two ':'-separated
            fields are used.

            Raises:
                Exception: If the input is none of the supported types.
            """
            # str plus unicode on Python 2; the old `type(...) is str or unicode`
            # test was always truthy, so ints and invalid types never reached
            # their own branches.
            string_types = (str, type(u''))

            def clock_to_seconds(time_str):
                # Shared by the scalar and list paths so both parse identically.
                if ':' not in time_str:
                    return int(time_str)
                i_split = time_str.split(':')
                return int(i_split[0] or 0)*60 + int(i_split[1])

            if isinstance(time_str_input, list):
                return [clock_to_seconds(i) for i in time_str_input]
            elif isinstance(time_str_input, string_types):
                return clock_to_seconds(time_str_input)
            elif isinstance(time_str_input, int):
                return time_str_input
            else:
                raise Exception('Invalid to_seconds input type: {0}'.format(type(time_str_input)))

        def boxes_pull(input_filename):
            # Pull artist, album info from boxes csv
            double_type = None
            input_filename = input_filename.lstrip('0')
            input_filename = input_filename.rsplit('-')[0]
            with open(config.boxes_path, 'r') as boxes:
                spamreader = csv.reader(boxes)
                rowdata = []
                for row in spamreader:
                    rowdata.append(row)
                for row in rowdata:
                    row_serial = row[0].translate(None,' ').lower()
                    if row_serial == input_filename:
                        print 'Match!'
                        print 'RealRow:', row
                        artist = row[1]
                        album = row[2]
                        if album.lower() == 'self titled' or album.lower() == 'self-titled':
                            album = artist
                        if row[5].lower() == 'x':
                            print 'Double trouble!'
                            if row[6].startswith('1/2') or row[6] == '':
                                print '1/2!'
                                double_type = '1/2'
                            elif row[6].startswith('1/4'):
                                print 'Eeeek! 1/4!!'
                                double_type = '1/4'
                        elif row[5].lower() and row[5].lower() != 'x':
                            double_type = 'other'
                        if 'live' in row[4].lower():
                            print 'Eeeek! Live album!'
                        return artist, album, double_type
                else:
                    return None, None, None

        filename_matches = []
        log_comment = ''
        artist = ''
        album = ''
        track_lengths_correlation = 0
        discogs_match = False
        resplit_serial = False
        requested_serial = False

        print
        print '----------------------'
        print 'Query:', input_filename

        # Check for filename query matches
        for each_file in os.listdir(config.split_folder):
            if each_file.startswith(input_filename):
                # Skip 'a' matches for non-'a' files
                if len(input_filename) == 5 and each_file[5] != '_':
                    continue            
                filename_matches.append(each_file)
        if not filename_matches:
            print 'No split (_xx) audio file matches! Dork.'
            return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
        
        # Check for artist tags, add from spreadsheet if enabled and necessary
        if config.use_boxes_csv:
            artist, album, double_type = boxes_pull(input_filename)
            if (artist, album) == (None, None):
                log_comment = 'No spreadsheet match found.'
                return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
            elif double_type == 'other':
                log_comment = 'Freaky double! Engage manual tagging mode.'
                if not input_resplit_list:
                    return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
            # Add tracknumber, album, artist to FLAC files
            for each_file in filename_matches:
                audio = FLAC(os.path.join(config.split_folder, each_file))
                audio['artist'] = artist
                audio['album'] = album
                audio.save()
        else:
            try:
                artist = FLAC(os.path.join(config.split_folder, filename_matches[0]))['artist'][0]
                album = FLAC(os.path.join(config.split_folder, filename_matches[0]))['album'][0]
                print 'Metadata found. Artist:', artist, 'Album:', album
            except:
                print 'No artist/album tags; boxes spreadsheet not searched:', input_filename
                return input_filename, artist, album, track_lengths_correlation, log_comment

        discogs_search = True
        if input_resplit_list:        
            # Check re-split_list.csv for specified Discogs release ID
            with open(resplit_list_path, 'r') as resplit_list:
                rowdata = [row for row in csv.reader(resplit_list)]
                for row in rowdata:
                    row_serial = row[0].translate(None,' ').lower()
                    if row_serial.startswith(input_filename.lstrip('0')):
                        if 'csv' in row[1].lower():
                            print 'Skipped: track titles in resplit-track-lists.csv'                  
                            return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
                        elif row[1].isdigit():
                            print 'Requested serial from resplit list: {0}'.format(row[1])
                            resplit_serial = True
                            output = [discogs.release(int(row[1]))]
                            discogs_search = False
                            break
                        elif row[1]:
                            print 'Re-split serial field non-numeric!'
        if discogs_search:
            query = ''.join(ch for ch in (artist+' '+album) if ch.isalnum() or ch in ' -/,\'')
            # Search Discogs for release ID/artist + album
            output = discogs.search(query, type='release')
            print 'Discogs results for "{0}":'.format(query)

        # Print track listing, tag FLAC files with titles
        accept_blank_tracklist_durations = False
        for search_loop in range(2):
            if accept_blank_tracklist_durations and requested_serial:
                output = discogs.search(query, type='release')
            time.sleep(config.discogs_request_interval) # Wait to comply with Discogs 60 requests/minute limit
            try:
                for i, result in enumerate(output):
                    if i >= config.max_search_tries:
                        accept_blank_tracklist_durations = True
                        print 'No results with track times found! Let\'s get a bit fuzzier...'
                        break
                    time.sleep(config.discogs_request_interval) # Wait to comply with Discogs 60 requests/minute limit
                    if hasattr(result, 'tracklist'): #and type(result) != discogs_client.Master:
                        if resplit_serial or accept_blank_tracklist_durations:
                            tracklist = [track for track in result.tracklist]
                        else:
                            tracklist = [track for track in result.tracklist if track.duration and to_seconds(track.duration) > 0]
                        if not tracklist:
                            print 'Result {0}: No tracks found. Continuing...'.format(str(i + 1))
                            continue
                        elif len(tracklist) != len(result.tracklist):
                            print 'Result {0}: Some tracks missing durations. Be careful!'.format(str(i + 1))
                        elif len(tracklist) == len(filename_matches):
                            print '----------------------'
                            print 'Result', str(i + 1)
                            print 'Release ID:', result.data['id']
                            print 'Artist:', result.artists[0].name.encode('utf-8')
                            print 'Album:', result.title.encode('utf-8')

                            # Funky double handling
                            if double_type in ['1/4', '1/3'] and filename_matches[0][-7] not in 'abcd':
                                if not result.tracklist[0].position:
                                    print 'Result {0}: Nope! (No position info found in Discogs)'.format(str(i + 1))
                                    continue
                                if not result.tracklist[0].position[0].isalpha():
                                    print 'Result {0}: Nope! (No side info in Discogs positions)'.format(str(i + 1))
                                    continue
                                if double_type == '1/4':
                                    sort_key = ['a', 'd', 'b', 'c']
                                if double_type == '1/3':
                                    sort_key = ['a', 'c', 'b', 'd']
                                # Reorder tracklist by alpha position key
                                tracklist_sorted = []
                                for key in sort_key:
                                    for track in tracklist:
                                        if track.position.lower().startswith(key):
                                            tracklist_sorted.append(track)
                                tracklist = tracklist_sorted

                            if tracklist[0].duration and not resplit_serial:
                                # Check correlation of Discogs track lengths with those of FLAC files
                                discogs_lengths = [track.duration for track in tracklist if track.duration]
                                flac_lengths = []
                                for match in filename_matches:
                                    audio = FLAC(os.path.join(config.split_folder, match))
                                    flac_length = time.strftime('%M:%S', time.gmtime(audio.info.length)).lstrip('0')
                                    flac_lengths.append(flac_length)
                                track_lengths_correlation = round(pearson_def(to_seconds(discogs_lengths), to_seconds(flac_lengths)),4)
                                print 'Track lengths correlation:', track_lengths_correlation
                                if track_lengths_correlation < config.min_correlation:
                                    print 'Result {0}: Low correlation. Best check yoself!'.format(str(i + 1))
                                    continue

                            # Write tags to FLAC files
                            discogs_match = True
                            for track, match in zip(tracklist, filename_matches):
                                audio = FLAC(os.path.join(config.split_folder, match))
                                audio['tracknumber'] = track.position
                                audio['title'] = track.title
                                if 'artists' in track.data:
                                    # Add artist info to compilation album tracks
                                    audio['artist'] = track.data['artists'][0]['name'].split('(')[0].strip()
                                flac_length = time.strftime('%M:%S', time.gmtime(audio.info.length)).lstrip('0')
                                print track.position, track.title.encode('utf-8'), track.duration, '-->', match, flac_length
                                audio.save()
                            return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
                        else:
                            print 'Result '+str(i + 1)+': Nope! ('+str(len(tracklist))+' != '+str(len(filename_matches))+')'
                else:
                    if len(output) == 0: # Don't retry fuzzy-style if there weren't any Discogs query matches
                        log_comment = 'No matches for Discogs query!'
                        return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
                    elif search_loop == 0:
                        accept_blank_tracklist_durations = True
                        print 'No results with track times found! Let\'s get a bit fuzzier...'
            except Exception as e: # httplib.BadStatusLine?
                print 'httplib.BadStatusLine (?) error:', e
                return None
        else:
            log_comment = 'No proper Discogs matches! Dork.'
        return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match

    input_filename_list = []
    matches_count = 0
    start_time = time.time()
    discogs_logged_in = False
    tries = 0
    runtime_errors = []

    # Logging setup
    if config.log_to_file:
        logging.basicConfig(filename=os.path.join(config.log_folder, 'log.csv'), format='%(levelname)s,%(message)s', level=logging.INFO)
    else:
        logging.basicConfig(format='%(levelname)s,%(message)s', level=logging.DEBUG)

    # Log into Discogs
    if not DISCOGS_MANUAL_AUTH:
        discogs = discogs_client.Client(CLIENT_NAME, CONSUMER_KEY, CONSUMER_SECRET, TOKEN, SECRET)
    else:
        while not discogs_logged_in:
            try:
                discogs = discogs_auth()
                if discogs == 'q': sys.exit()
                discogs_logged_in = True
            except:
                print 'Nope! Try again.'

    # Collect, print input filenames
    if config.serial_series:
        for f in config.serial_series:
            input_filename_list.append(str(f).zfill(5))
        print 'Input filenames (series):', input_filename_list
    elif config.input_whole_folder:
        for f in os.listdir(config.split_folder):
            if f.endswith('.flac') or f.endswith('.mp3'):
                f = f.rsplit('_clean')[0]
                if f[-2] == '-':
                    f = f[:-1]
                if f not in input_filename_list:
                    input_filename_list.append(f)
        print 'Input filenames ('+str(len(input_filename_list))+'):', input_filename_list
    elif type(input_filenames) is not list:
        input_filename_list = input_filenames.split(',')
    else:
        input_filename_list = input_filenames
    files_count = len(input_filename_list)

    # Grab spreadsheet from Google Drive
    utils.export_csv('Boxes!')

    # Do the things
    while tries == 0 or runtime_errors and tries <= config.max_error_tries:
        if runtime_errors:
            print 'Runtime errors:', runtime_errors
            print '{0} runtime errors found! See above.'.format(len(runtime_errors))
            input_filename_list = runtime_errors
            runtime_errors = []
        for each in input_filename_list:
            result = add_serial_metadata(each)
            if result is None:
                log_comment = 'We tried. We httplib.BadStatusLined.'
                runtime_errors.append(each)
            else:
                input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match = result
                if discogs_match:
                    matches_count += 1
            if result or tries > 3:
                # Log track data
                print log_comment
                artist = artist.translate(None, ',') if artist else ''
                album = album.translate(None, ',') if album else ''
                logging.info('%s,%s,%s,%s,%s', input_filename, artist, album, track_lengths_correlation, log_comment)
        tries += 1

    # Now the items with manually-named track titles from resplit_track_lists.csv
    if input_resplit_list:
        with open(resplit_track_lists_path, 'r') as resplit_track_lists:
            config.split_folder_list = os.listdir(config.split_folder)
            server_directory_list = os.listdir(server_split_path)
            rowdata = [row for row in csv.reader(resplit_track_lists)]
            for row in rowdata:
                query = [''.join(x) for _, x in itertools.groupby(row[0], key=str.isdigit)]
                query = ''.join([query[0].zfill(5)] + query[1:])
                # Check for filename query matches
                filename_matches = [os.path.join(config.split_folder, each) for each in config.split_folder_list if each.startswith(query)]
                if not filename_matches:
                    filename_matches = [os.path.join(server_split_path, each) for each in server_directory_list if each.startswith(query)]
                if not filename_matches:
                    print 'No split (_xx) audio file matches for resplit serial {0}! Dork.'.format(row[0])
                    continue
                # Write tags to FLAC files
                tracklist = [col for col in row[1:] if col]
                for position, (track, match) in enumerate(zip(tracklist, filename_matches), 1):
                    audio = FLAC(os.path.join(config.split_folder, match))
                    audio['tracknumber'] = str(position)
                    audio['title'] = track.decode('unicode-escape')
                    flac_length = time.strftime('%M:%S', time.gmtime(audio.info.length)).lstrip('0')
                    print position, flac_length, track, '-->', match
                    audio.save()
                matches_count += 1

    # Done? Great!
    accuracy = int(round(matches_count / float(files_count) * 100))
    processing_time = int(round(time.time() - start_time))
    print 'Great success! {0} files ({1} matches, {2}%) processed in {3}s.'.format(files_count, matches_count, accuracy, processing_time)
Ejemplo n.º 14
0
                pass
                #print 'No cover found for item ', mp3_fname
        else:
            region = wave.open(region_name, 'w')
            region.setparams(ifile_params)
            region.writeframes(ifile.readframes(region_length))
            region.close()
    ifile.close()


if __name__ == '__main__':
    # Script entry point: set up logging, then collect input filenames.
    # Make input filename list
    logging.basicConfig(level=config.log_level)

    if config.input_resplit_list:
        # Called with the two re-split sheet names; presumably exports them to CSV
        # before the re-split list is read below -- confirm against utils.export_csv.
        utils.export_csv(['re-split_list', 'resplit-track-lists'])
        with open(config.resplit_list_path, 'r') as resplit_list:
            rows = csv.reader(resplit_list)
            for row in rows:
                # The string literal below is disabled code kept as a no-op expression statement:
                # it zero-padded the leading numeric part of the serial to 5 digits.
                '''
                numeric_length = 50
                for i, c in enumerate(row[0]):
                    if not c.isdigit():
                        numeric_length = i
                        break
                #print row[0][:numeric_length].zfill(5) + row[0][numeric_length:] + '_clean.wav'
                input_filename_list.append(row[0][:numeric_length].zfill(5) + row[0][numeric_length:] + '_clean.wav')
                '''
                # The first column is used verbatim as the serial; '_clean.wav' is appended to it.
                print row[0]
                # NOTE(review): input_filename_list is not defined in the visible part of this
                # script; presumably a module-level list declared earlier -- confirm.
                input_filename_list.append(row[0] + '_clean.wav')
        # Remove bad split files from Julius
Ejemplo n.º 15
0
                if not 'pool' in page_data:
                    page_data['pool'] = pool

                # add the page to the file...
                if not page_wbs in relationships[file_wbs]['pages']:
                    relationships[file_wbs]['pages'][page_wbs] = page_data

    else:
        print('no files found in saved data! \n\tPath: {}'.format(data['folders']))

    #print('{}'.format(json.dumps(relationships, indent=4)))   
    return object_types

# Load the pickled data and run it through summarize_data(); the result is a mapping
# whose values are concatenated into the exported rows below.
pickle_file = 'visio_data.pickle'
data = get_pickle_data(pickle_file)
object_types = summarize_data(data)

# Field template passed to export_csv: every known field name, each defaulting to None.
obj_model = dict.fromkeys([
    "fileGUID", "filename", "title", "creator", "pageGUID", "pagename",
    "objectype", "shapeGUID", "shapeID", "shapeName", "shapeType",
    "shapeText", "shapeCallouts", "shapeConnects", "shapeConnected",
    "shapeContain",
])

# Flatten the per-key collections into one list, preserving iteration order.
rows = [row for group in object_types.values() for row in group]

export_csv(fields=obj_model, data=rows)
Ejemplo n.º 16
0
def main(input_resplit_list=False):
    def add_serial_metadata(input_filename):
        def discogs_auth():
            """
            Interactive Discogs OAuth login.

            Builds a client, opens the authorize URL in the web browser and prompts for the token,
            repeating all of that until something non-empty is entered. Returns "q" if the user
            typed q (any case); otherwise exchanges the token for an access token and returns the client.
            """
            while True:
                discogs = discogs_client.Client(CLIENT_NAME)
                discogs.set_consumer_key(CONSUMER_KEY, CONSUMER_SECRET)
                # Element 2 of the get_authorize_url() result is what gets opened in the browser
                webbrowser.open(discogs.get_authorize_url()[2])
                authorize_token = raw_input("Enter authorize token (Or q to quit): ")
                if authorize_token:
                    break
            if authorize_token.lower() == "q":
                return "q"
            access_token = discogs.get_access_token(authorize_token)
            logging.debug("access_token: {0}".format(access_token))
            return discogs

        def average(x):
            """Return the arithmetic mean of the non-empty sequence x as a float."""
            count = len(x)
            assert count > 0
            total = sum(x)
            return float(total) / count

        def pearson_def(x, y):
            """
            Correlation between listed Discogs release durations and split FLAC file durations.

            x and y must be non-empty numeric sequences of equal length (AssertionError otherwise).
            For exactly two items Pearson's coefficient carries no information, so the product of
            the per-item shorter/longer ratios is returned instead. When either sequence has zero
            variance (all of its durations equal) the coefficient is undefined; 1 is returned
            rather than dividing by zero.
            """
            assert len(x) == len(y)
            n = len(x)
            assert n > 0

            def ratio(a, b):
                # Shorter-to-longer ratio; two zero-length durations count as a perfect match
                longer = max(a, b)
                return 1.0 if longer == 0 else min(a, b) / float(longer)

            if n == 2:  # Pearson's is useless for two points; compare the lengths directly
                return ratio(x[0], y[0]) * ratio(x[1], y[1])
            avg_x = float(sum(x)) / n
            avg_y = float(sum(y)) / n
            diffprod = 0
            xdiff2 = 0
            ydiff2 = 0
            for x_val, y_val in zip(x, y):
                xdiff = x_val - avg_x
                ydiff = y_val - avg_y
                diffprod += xdiff * ydiff
                xdiff2 += xdiff * xdiff
                ydiff2 += ydiff * ydiff
            # Both variances must be checked: a constant y used to slip through and divide by zero
            if xdiff2 == 0 or ydiff2 == 0:
                return 1
            return diffprod / math.sqrt(xdiff2 * ydiff2)

        def to_seconds(time_str_input):
            """
            Convert duration stamps to whole seconds.

            Accepts a single "m:ss" / "h:mm:ss" / plain-seconds string (byte or unicode), a list of
            such strings (returns a list of ints), or an int (returned unchanged). An empty leading
            field counts as zero, so ":45" is 45 seconds. Raises Exception for any other input type.
            """

            def stamp_to_seconds(stamp):
                # No colon means the stamp is already a number of seconds
                if ":" not in stamp:
                    return int(stamp)
                fields = stamp.split(":")
                # Leading field may be empty because callers strip leading zeros ("00:45" -> ":45")
                total = int(fields[0] or 0)
                for field in fields[1:]:
                    total = total * 60 + int(field)
                return total

            if isinstance(time_str_input, list):
                return [stamp_to_seconds(stamp) for stamp in time_str_input]
            # type(u"") is unicode on Python 2 and str on Python 3. The previous test
            # (`type(x) is str or unicode`) was always true, which made the branches below unreachable.
            if isinstance(time_str_input, (str, type(u""))):
                return stamp_to_seconds(time_str_input)
            if isinstance(time_str_input, int):
                return time_str_input
            raise Exception("Invalid to_seconds input type: {0}".format(type(time_str_input)))

        def boxes_pull(input_filename):
            # Pull artist, album info from boxes csv
            double_type = None
            input_filename = input_filename.lstrip("0")
            input_filename = input_filename.rsplit("-")[0]
            with open(boxes_path, "r") as boxes:
                spamreader = csv.reader(boxes)
                rowdata = []
                for row in spamreader:
                    rowdata.append(row)
                for row in rowdata:
                    row_serial = row[0].translate(None, " ").lower()
                    if row_serial == input_filename:
                        print "Match!"
                        print "RealRow:", row
                        artist = row[1]
                        album = row[2]
                        if album.lower() == "self titled" or album.lower() == "self-titled":
                            album = artist
                        if row[5].lower() == "x":
                            print "Double trouble!"
                            if row[6].startswith("1/2") or row[6] == "":
                                print "1/2!"
                                double_type = "1/2"
                            elif row[6].startswith("1/4"):
                                print "Eeeek! 1/4!!"
                                double_type = "1/4"
                        elif row[5].lower() and row[5].lower() != "x":
                            double_type = "other"
                        if "live" in row[4].lower():
                            print "Eeeek! Live album!"
                        return artist, album, double_type
                else:
                    return None, None, None

        filename_matches = []
        log_comment = ""
        artist = ""
        album = ""
        track_lengths_correlation = 0
        discogs_match = False
        resplit_serial = False
        requested_serial = False

        print
        print "----------------------"
        print "Query:", input_filename

        # Check for filename query matches
        for each_file in os.listdir(flac_directory):
            if each_file.startswith(input_filename):
                # Skip 'a' matches for non-'a' files
                if len(input_filename) == 5 and each_file[5] != "_":
                    continue
                filename_matches.append(each_file)
        if not filename_matches:
            print "No split (_xx) audio file matches! Dork."
            return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match

        # Check for artist tags, add from spreadsheet if enabled and necessary
        if use_boxes_csv:
            artist, album, double_type = boxes_pull(input_filename)
            if (artist, album) == (None, None):
                log_comment = "No spreadsheet match found."
                return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
            elif double_type == "other":
                log_comment = "Freaky double! Engage manual tagging mode."
                if not input_resplit_list:
                    return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
            # Add tracknumber, album, artist to FLAC files
            for each_file in filename_matches:
                audio = FLAC(os.path.join(flac_directory, each_file))
                audio["artist"] = artist
                audio["album"] = album
                audio.save()
        else:
            try:
                artist = FLAC(os.path.join(flac_directory, filename_matches[0]))["artist"][0]
                album = FLAC(os.path.join(flac_directory, filename_matches[0]))["album"][0]
                print "Metadata found. Artist:", artist, "Album:", album
            except:
                print "No artist/album tags; boxes spreadsheet not searched:", input_filename
                return input_filename, artist, album, track_lengths_correlation, log_comment

        discogs_search = True
        if input_resplit_list:
            # Check re-split_list.csv for specified Discogs release ID
            with open(resplit_list_path, "r") as resplit_list:
                rowdata = [row for row in csv.reader(resplit_list)]
                for row in rowdata:
                    row_serial = row[0].translate(None, " ").lower()
                    if row_serial.startswith(input_filename.lstrip("0")):
                        if "csv" in row[1].lower():
                            print "Skipped: track titles in resplit-track-lists.csv"
                            return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
                        elif row[1].isdigit():
                            print "Requested serial from resplit list: {0}".format(row[1])
                            resplit_serial = True
                            output = [discogs.release(int(row[1]))]
                            discogs_search = False
                            break
                        elif row[1]:
                            print "Re-split serial field non-numeric!"
        if discogs_search:
            query = "".join(ch for ch in (artist + " " + album) if ch.isalnum() or ch in " -/,'")
            # Search Discogs for release ID/artist + album
            output = discogs.search(query, type="release")
            print 'Discogs results for "{0}":'.format(query)

        # Print track listing, tag FLAC files with titles
        accept_blank_tracklist_durations = False
        for search_loop in range(2):
            if accept_blank_tracklist_durations and requested_serial:
                output = discogs.search(query, type="release")
            time.sleep(discogs_request_interval)  # Wait to comply with Discogs 60 requests/minute limit
            try:
                for i, result in enumerate(output):
                    if i >= max_search_tries:
                        accept_blank_tracklist_durations = True
                        print "No results with track times found! Let's get a bit fuzzier..."
                        break
                    time.sleep(discogs_request_interval)  # Wait to comply with Discogs 60 requests/minute limit
                    if hasattr(result, "tracklist"):  # and type(result) != discogs_client.Master:
                        if resplit_serial or accept_blank_tracklist_durations:
                            tracklist = [track for track in result.tracklist]
                        else:
                            tracklist = [
                                track for track in result.tracklist if track.duration and to_seconds(track.duration) > 0
                            ]
                        if not tracklist:
                            print "Result {0}: No tracks found. Continuing...".format(str(i + 1))
                            continue
                        elif len(tracklist) != len(result.tracklist):
                            print "Result {0}: Some tracks missing durations. Be careful!".format(str(i + 1))
                        elif len(tracklist) == len(filename_matches):
                            print "----------------------"
                            print "Result", str(i + 1)
                            print "Release ID:", result.data["id"]
                            print "Artist:", result.artists[0].name.encode("utf-8")
                            print "Album:", result.title.encode("utf-8")

                            # Funky double handling
                            if double_type in ["1/4", "1/3"] and filename_matches[0][-7] not in "abcd":
                                if not result.tracklist[0].position:
                                    print "Result {0}: Nope! (No position info found in Discogs)".format(str(i + 1))
                                    continue
                                if not result.tracklist[0].position[0].isalpha():
                                    print "Result {0}: Nope! (No side info in Discogs positions)".format(str(i + 1))
                                    continue
                                if double_type == "1/4":
                                    sort_key = ["a", "d", "b", "c"]
                                if double_type == "1/3":
                                    sort_key = ["a", "c", "b", "d"]
                                # Reorder tracklist by alpha position key
                                tracklist_sorted = []
                                for key in sort_key:
                                    for track in tracklist:
                                        if track.position.lower().startswith(key):
                                            tracklist_sorted.append(track)
                                tracklist = tracklist_sorted

                            if tracklist[0].duration and not resplit_serial:
                                # Check correlation of Discogs track lengths with those of FLAC files
                                discogs_lengths = [track.duration for track in tracklist if track.duration]
                                flac_lengths = []
                                for match in filename_matches:
                                    audio = FLAC(os.path.join(flac_directory, match))
                                    flac_length = time.strftime("%M:%S", time.gmtime(audio.info.length)).lstrip("0")
                                    flac_lengths.append(flac_length)
                                track_lengths_correlation = round(
                                    pearson_def(to_seconds(discogs_lengths), to_seconds(flac_lengths)), 4
                                )
                                print "Track lengths correlation:", track_lengths_correlation
                                if track_lengths_correlation < min_correlation:
                                    print "Result {0}: Low correlation. Best check yoself!".format(str(i + 1))
                                    continue

                            # Write tags to FLAC files
                            discogs_match = True
                            for track, match in zip(tracklist, filename_matches):
                                audio = FLAC(os.path.join(flac_directory, match))
                                audio["tracknumber"] = track.position
                                audio["title"] = track.title
                                if "artists" in track.data:
                                    # Add artist info to compilation album tracks
                                    audio["artist"] = track.data["artists"][0]["name"].split("(")[0].strip()
                                flac_length = time.strftime("%M:%S", time.gmtime(audio.info.length)).lstrip("0")
                                print track.position, track.title.encode(
                                    "utf-8"
                                ), track.duration, "-->", match, flac_length
                                audio.save()
                            return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
                        else:
                            print "Result " + str(i + 1) + ": Nope! (" + str(len(tracklist)) + " != " + str(
                                len(filename_matches)
                            ) + ")"
                else:
                    if len(output) == 0:  # Don't retry fuzzy-style if there weren't any Discogs query matches
                        log_comment = "No matches for Discogs query!"
                        return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match
                    elif search_loop == 0:
                        accept_blank_tracklist_durations = True
                        print "No results with track times found! Let's get a bit fuzzier..."
            except Exception as e:  # httplib.BadStatusLine?
                print "httplib.BadStatusLine (?) error:", e
                return None
        else:
            log_comment = "No proper Discogs matches! Dork."
        return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match

    # Initial variables
    config = ConfigParser()
    config.read(CONFIG_LOCATION)

    input_filename_series = config.getboolean("discogs", "input_filename_series")
    input_whole_folder = config.getboolean("discogs", "input_whole_folder")
    use_boxes_csv = config.getboolean("discogs", "use_boxes_csv")
    input_filenames = config.get("discogs", "input_filenames")
    input_filename_list_start = config.getint("discogs", "input_filename_list_start")
    input_filename_list_end = config.getint("discogs", "input_filename_list_end")
    flac_directory = config.get("discogs", "flac_directory")
    boxes_path = config.get("general", "boxes_path")
    log_to_file = config.getboolean("discogs", "log_to_file")
    log_path = config.get("general", "log_path")
    discogs_request_interval = config.getfloat("discogs", "discogs_request_interval")  # seconds between API requests
    resplit_list_path = config.get("general", "resplit_list_path")
    min_correlation = config.getfloat("discogs", "min_correlation")
    max_search_tries = config.getint("discogs", "max_search_tries")
    max_error_tries = config.getint("discogs", "max_error_tries")
    resplit_track_lists_path = config.get("general", "resplit_track_lists_path")
    input_resplit_list = config.getboolean("split", "input_resplit_list")
    server_split_path = config.get("general", "server_split_path")

    input_filename_list_range = range(input_filename_list_start, input_filename_list_end)
    input_filename_list = []
    matches_count = 0
    start_time = time.time()
    discogs_logged_in = False
    tries = 0
    runtime_errors = []

    # Logging setup
    if log_to_file:
        logging.basicConfig(
            filename=os.path.join(log_path, "log.csv"), format="%(levelname)s,%(message)s", level=logging.INFO
        )
    else:
        logging.basicConfig(format="%(levelname)s,%(message)s", level=logging.DEBUG)

    # Log into Discogs
    if not DISCOGS_MANUAL_AUTH:
        discogs = discogs_client.Client(CLIENT_NAME, CONSUMER_KEY, CONSUMER_SECRET, TOKEN, SECRET)
    else:
        while not discogs_logged_in:
            try:
                discogs = discogs_auth()
                if discogs == "q":
                    sys.exit()
                discogs_logged_in = True
            except:
                print "Nope! Try again."

    # Collect, print input filenames
    if input_filename_series:
        for f in input_filename_list_range:
            input_filename_list.append(str(f).zfill(5))
        print "Input filenames (series):", input_filename_list
    elif input_whole_folder:
        for f in os.listdir(flac_directory):
            if f.endswith(".flac") or f.endswith(".mp3"):
                f = f.rsplit("_clean")[0]
                if f[-2] == "-":
                    f = f[:-1]
                if f not in input_filename_list:
                    input_filename_list.append(f)
        print "Input filenames (" + flac_directory + "):", input_filename_list
    elif type(input_filenames) is not list:
        input_filename_list = input_filenames.split(",")
    else:
        input_filename_list = input_filenames
    files_count = len(input_filename_list)

    # Grab spreadsheet from Google Drive
    utils.export_csv("Boxes!")

    # Do the things
    while tries == 0 or runtime_errors and tries <= max_error_tries:
        if runtime_errors:
            print "Runtime errors:", runtime_errors
            print "{0} runtime errors found! See above.".format(len(runtime_errors))
            input_filename_list = runtime_errors
            runtime_errors = []
        for each in input_filename_list:
            result = add_serial_metadata(each)
            if result is None:
                log_comment = "We tried. We httplib.BadStatusLined."
                runtime_errors.append(each)
            else:
                input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match = result
                if discogs_match:
                    matches_count += 1
            if result or tries > 3:
                # Log track data
                print log_comment
                artist = artist.translate(None, ",") if artist else ""
                album = album.translate(None, ",") if album else ""
                logging.info("%s,%s,%s,%s,%s", input_filename, artist, album, track_lengths_correlation, log_comment)
        tries += 1

    # Now the items with manually-named track titles from resplit_track_lists.csv
    if input_resplit_list:
        with open(resplit_track_lists_path, "r") as resplit_track_lists:
            flac_directory_list = os.listdir(flac_directory)
            server_directory_list = os.listdir(server_split_path)
            rowdata = [row for row in csv.reader(resplit_track_lists)]
            for row in rowdata:
                query = ["".join(x) for _, x in itertools.groupby(row[0], key=str.isdigit)]
                query = "".join([query[0].zfill(5)] + query[1:])
                # Check for filename query matches
                filename_matches = [
                    os.path.join(flac_directory, each) for each in flac_directory_list if each.startswith(query)
                ]
                if not filename_matches:
                    filename_matches = [
                        os.path.join(server_split_path, each)
                        for each in server_directory_list
                        if each.startswith(query)
                    ]
                if not filename_matches:
                    print "No split (_xx) audio file matches for resplit serial {0}! Dork.".format(row[0])
                    continue
                # Write tags to FLAC files
                tracklist = [col for col in row[1:] if col]
                for position, (track, match) in enumerate(zip(tracklist, filename_matches), 1):
                    audio = FLAC(os.path.join(flac_directory, match))
                    audio["tracknumber"] = str(position)
                    audio["title"] = track.decode("unicode-escape")
                    flac_length = time.strftime("%M:%S", time.gmtime(audio.info.length)).lstrip("0")
                    print position, flac_length, track, "-->", match
                    audio.save()
                matches_count += 1

    # Done? Great!
    accuracy = int(round(matches_count / float(files_count) * 100))
    processing_time = int(round(time.time() - start_time))
    print "Great success! {0} files ({1} matches, {2}%) processed in {3}s.".format(
        files_count, matches_count, accuracy, processing_time
    )
Ejemplo n.º 17
0
def export_product():
    """Pass the result of utils.export_csv() to send_file and return what send_file returns."""
    exported = utils.export_csv()
    return send_file(exported)
Ejemplo n.º 18
0
 def create_csv(self):
     """Call self.get_data(), then hand self.header and self.body to utils.export_csv as 'vultr.csv'."""
     self.get_data()
     target = 'vultr.csv'
     utils.export_csv(self.header, self.body, target)
def export_classification(years, kwLimit, min_edge_th, dispth, ethunit):
    """Export keyword-community tables for the patents of the given application years.

    Three semicolon-separated CSV files are written into a result directory derived
    from the parameters:

    * ``probas_*``: per patent, how many of its keywords fall in each community
      (only patents with at least one classified keyword).
    * ``keywords_*``: per keyword, its community and graph attributes.
    * ``patent_*``: per patent, the sums of its keywords' attributes.

    ``years`` is a non-empty list of year strings (they are concatenated into file
    names and converted with ``int`` for the window size). The remaining parameters
    only select the pickled graph file and label the outputs.

    Raises OSError if the result directory cannot be created.
    """
    first_year, last_year = years[0], years[-1]
    yearrange = first_year + "-" + last_year
    # File names use str(kwLimit) while the directory name uses str(int(kwLimit));
    # both spellings are kept exactly as before.
    file_suffix = ('_kwLimit' + str(kwLimit) + '_dispth' + str(dispth) +
                   '_ethunit' + str(ethunit) + '.csv')
    resdir = ('classification/classification_window' +
              str(int(last_year) - int(first_year) + 1) + '_kwLimit' +
              str(int(kwLimit)) + '_dispth' + str(dispth) + '_ethunit' +
              str(ethunit))
    try:
        os.makedirs(resdir)
    except OSError:
        # Only an already-existing directory is fine; anything else (permissions,
        # a file in the way) must not be reported as "exists".
        if not os.path.isdir(resdir):
            raise
        print("res dir exists")

    print("Constructing patent probas for years " + str(years))

    mongo = pymongo.MongoClient(utils.get_parameter('mongopath', True, True))
    keywords_collection = mongo['patent']['keywords']
    year_filter = {"app_year": {"$in": years}}

    # load keywords
    patents = keywords_collection.find(year_filter, no_cursor_timeout=True)
    npatents = patents.count()

    # load graph and construct communities
    # NOTE: pickle.load executes arbitrary code from the file; only load trusted pickles.
    graph_path = ('pickled/filteredgraphcoms_' + yearrange + '_' +
                  str(kwLimit) + '_eth' + str(min_edge_th) + '_dispth' +
                  str(dispth) + '_ethunit' + str(ethunit) + '.pkl')
    with open(graph_path, 'rb') as graph_file:
        graph, coms = pickle.load(graph_file)
    # best clustering in com[len(com)-1]
    clustering = coms[-1]

    # construct dico kw -> community
    names = graph.vs['name']
    dico = {}
    for n in range(graph.vcount()):
        dico[names[n]] = clustering.membership[n]

    ncommunities = len(clustering.sizes())
    probas = []
    rownames = []
    counts = []

    # A no_cursor_timeout cursor is never reaped by the server, so close it
    # explicitly even if the loop fails part-way.
    try:
        for i, currentpatent in enumerate(patents):
            if i % 10000 == 0:
                print('probas : ' + str(100 * i / npatents))
            keywords = currentpatent['keywords']
            nk = len(keywords)
            currentprobas = [0.0] * ncommunities
            for kw in keywords:
                if kw in dico:
                    currentprobas[dico[kw]] += 1
            if sum(currentprobas) > 0:
                probas.append(currentprobas)
                rownames.append(currentpatent['id'])
                counts.append(nk)
    finally:
        patents.close()

    # export the matrix proba as csv
    utils.export_matrix_sparse_csv(
        probas, [rownames, counts],
        resdir + '/probas_' + yearrange + file_suffix, ";")

    # add attributes to keywords
    all_vertices = range(graph.vcount())
    degree = graph.degree(all_vertices)
    evcentrality = graph.eigenvector_centrality(weights='weight')
    bcentrality = graph.betweenness(weights='weight')
    ccentrality = graph.closeness(weights='weight')
    weighteddegree = graph.strength(all_vertices, weights='weight')

    kwattrsdico = {}
    for n in all_vertices:
        kwattrsdico[graph.vs['name'][n]] = [
            graph.vs['tidf'][n], graph.vs['disp'][n], graph.vs['docfreq'][n],
            graph.vs['termhood'][n], degree[n], weighteddegree[n],
            bcentrality[n], ccentrality[n], evcentrality[n]
        ]

    kwdata = [[currentkw, dico[currentkw]] + kwattrsdico[currentkw]
              for currentkw in dico if currentkw in kwattrsdico]

    # export keywords as csv
    utils.export_csv(
        kwdata, resdir + '/keywords_' + yearrange + file_suffix, ';',
        'keyword;community;tidf;technodispersion;docfreq;termhood;degree;weighteddegree;betweennesscentrality;closenesscentrality;eigenvectorcentrality'
    )

    # Patent measures
    measures = []
    nmeasures = len(kwattrsdico[graph.vs['name'][0]])
    patents = keywords_collection.find(year_filter, no_cursor_timeout=True)
    try:
        for i, currentpatent in enumerate(patents):
            # Progress is printed for every patent (the modulo throttle was disabled upstream)
            print('patent measures : ' + str(100 * i / npatents))
            keywords = currentpatent['keywords']
            currentmeasures = [0.0] * nmeasures
            kwnum = 0
            for kw in keywords:
                if kw in kwattrsdico:
                    # Element-wise sum; zip avoids an index variable named like the
                    # progress counter, which Python 2 comprehensions would overwrite.
                    currentmeasures = [
                        total + value
                        for total, value in zip(currentmeasures, kwattrsdico[kw])
                    ]
                    kwnum = kwnum + 1
            nk = len(keywords)
            if sum(currentmeasures) != 0:
                measures.append([currentpatent['id'], nk, kwnum] +
                                currentmeasures)
    finally:
        patents.close()

    # export measures
    utils.export_csv(
        measures, resdir + '/patent_' + yearrange + file_suffix, ';',
        'patent;kws;classkws;tidf;technodispersion;docfreq;termhood;degree;weighteddegree;betweennesscentrality;closenesscentrality;eigenvectorcentrality'
    )
Ejemplo n.º 20
0
 def create_csv(self):
     """Call self.get_data(), then hand self.header and self.body to utils.export_csv as 'digitalocean.csv'."""
     self.get_data()
     target = 'digitalocean.csv'
     utils.export_csv(self.header, self.body, target)
Ejemplo n.º 21
0
# Each entry bundles [browser, user id, password] for one browser session.
browser_arr = [browser, USER_ID, PASS_WORD]
browser_arr_2 = [browser_2, USER_ID_2, PASS_WORD_2]
browsers = [browser_arr, browser_arr_2]

if __name__ == '__main__':
    # Log in, set the wait time and check the current URL for every browser session
    for browser_param in browsers:
        utils.login(browser_param[1], browser_param[2], browser_param[0])
        utils.set_wait_time(TIME_TO_WAIT, browser_param[0])
        utils.check_current_url(browser_param[0])

    # If the CSV does not exist, collect all URLs into an array and export them as CSV
    if os.path.exists(URL_PATH) == False:
        utils.move_to_company_list(browsers[0][0])
        url_arr = utils.get_url(NUMBER_OF_COMPANY, browsers[0][0])
        utils.export_csv(url_arr, URL_PATH)
        # NOTE(review): the first browser is closed here although it was logged in above;
        # confirm the code that follows does not use it again on this path.
        utils.browser_close(browsers[0][0])
    else:
        # If the CSV exists, read it and store the contents in url_arr
        url_arr = utils.import_csv(URL_PATH)

    # Connect to the DB
    connector = MySQLdb.connect(
        unix_socket = DB_UNIX_SOCKET,
        host=DB_HOST, user=DB_USER, passwd=DB_PASS_WORD, db=DB_NAME
    )
    # NOTE(review): "corsor" looks like a typo for "cursor"; left unchanged because code
    # outside this view may refer to the name.
    corsor = connector.cursor()

    # Split the URL array into as many chunks as there are browsers
    url_arrs = list(np.array_split(url_arr, NUMBER_OF_BROWSERS))
    print(len(url_arrs[0]))
Ejemplo n.º 22
0
import os
from getFiles import scan_folders
from utils import export_csv

# Path components of the folder to scan; the last component is also passed to
# scan_folders as its second argument.
folders = ['H:\\', 'IT', 'Projects', '2018', 'IT2018051 HRIS']
folder = os.path.join(*folders)
leaf = folders[-1]

print('Scanning...\n\t{} -> {}'.format(folder, leaf))
data = scan_folders(folder, leaf)

# Export only when the scan result has a non-empty 'files' entry
files = data.get('files', [])
if len(files) > 0:
    export_csv(data=files, filename='filelist.csv')