def extract_comments(filename, top_n_subreddits):
    # reading a bz2 archive

    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")

        reddit_submissions = []
        for i, line in enumerate(cbz):

            try:
                post_dict = json.loads(line.rstrip())
                '''
                {
                    "created_utc":1506816000,
                    "link_id":"t3_73ieyz"
                }
                '''
                post_id, comment_date, subreddit_id = post_dict[
                    'link_id'], post_dict['created_utc'], post_dict[
                        'subreddit_id']

                if i % 100000 == 0:
                    print(i, post_id, comment_date, subreddit_id)

                if subreddit_id in top_n_subreddits:
                    subreddits_noComments[subreddit_id] += 1

            except Exception:
                # skip lines that are not valid JSON or are missing fields
                continue

    return reddit_submissions
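
A minimal usage sketch for the function above; the dump path, the subreddit-id set, and the global counter it increments are illustrative assumptions, not part of the original snippet.

from collections import defaultdict
from bz2 import BZ2File as bzopen
import codecs
import json

subreddits_noComments = defaultdict(int)   # global the function increments (assumed setup)
top_n = {'t5_2qh0u', 't5_2qh1i'}           # placeholder subreddit_id values
extract_comments('RC_2017-10.bz2', top_n)  # placeholder path to a monthly comments dump
print(dict(subreddits_noComments))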
Example #2
 def update_relationships(self, year, month):
     fname = os.path.join(AS_REL_DATA_DIR,
                          f'{year}{month}01.as-rel.txt.bz2')
     if not os.path.exists(fname):
         return
     with bzopen(fname, 'r') as f:
         for i, line in enumerate(f):
             line = line.decode('utf-8').strip()
             if line.startswith('#'):
                 continue
             else:
                 l = line.split('|')
                 as1 = int(l[0])
                 as2 = int(l[1])
                 relationship_type = int(l[2])
                 if relationship_type == -1:
                     r = 'c2p'
                 elif relationship_type == 0:
                     r = 'p2p'
                 else:
                     raise Exception('Invalid relationship type!')
                 if (as1, as2) not in self.G.edges:
                     self.G.add_edge(as1,
                                     as2,
                                     label=r,
                                     timestamp=datetime.date(
                                         year=int(year),
                                         month=int(month),
                                         day=1))
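
For reference, each non-comment line the loop above parses (lines starting with '#' are skipped) is a pipe-separated as1|as2|relationship record; a tiny illustration with made-up AS numbers:

sample = '64496|64511|0'                 # made-up ASNs; '0' is mapped to 'p2p' above, '-1' to 'c2p'
as1, as2, relationship_type = (int(x) for x in sample.split('|'))
print(as1, as2, relationship_type)       # -> 64496 64511 0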
Example #3
def open_file_by_mimetype(filename, mode):
    """
    This function determines the compression MIME type of a file as gz, bz, or none, and returns
    an open file handle of the requested mode ('w', 'r', or 'a')
    """

    if mode != 'r' and mode != 'w' and mode != 'a':
        print("please specific a valid mode:  w, r, a")
        return

    if guess_type(filename)[1] == 'gzip':
        try:
            fh = gzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    elif guess_type(filename)[1] == 'bzip2':
        try:
            fh = bzopen(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return
    else:
        try:
            fh = open(filename, mode)
        except Exception as error:
            print("Error opening file ", filename, ": ", error)
            return

    return fh
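
The helper above depends on three names not shown in the snippet. A minimal sketch of the assumed setup and a call follows; gzopen is assumed to be gzip.open, bzopen follows the "BZ2File as bzopen" import used elsewhere on this page, and the filename is a placeholder.

from mimetypes import guess_type
from gzip import open as gzopen
from bz2 import BZ2File as bzopen

fh = open_file_by_mimetype('reads.fastq.gz', 'r')   # placeholder filename
if fh is not None:
    first_line = fh.readline()
    fh.close()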
Example #4
def extract_comments(filename, top_n_subreddits):
    post_commentTimes = defaultdict(list)
    # reading a bz2 archive

    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")
        # print(len(list(cbz)))
        for i, line in enumerate(cbz):
            if i % 10000000 == 0:
                print(i)

            try:
                post_dict = json.loads(line.rstrip())

                #
                # {
                #    "created_utc":1506816000,
                #    "link_id":"t3_73ieyz"
                # }

                post_id, comment_date, subreddit_id, comment_body = post_dict[
                    'link_id'], post_dict['created_utc'], post_dict[
                        'subreddit_id'], post_dict['body']

                if subreddit_id in top_n_subreddits and post_id in user_post_ids:
                    #print(comment_date, comment_body)
                    post_commentTimes[post_id].append(
                        [comment_date, comment_body])

            except Exception:
                # skip lines that are not valid JSON or are missing fields
                continue

    return post_commentTimes
def main():
    # <http://www.wikidata.org/entity/Q47133351> <http://www.wikidata.org/prop/direct/P356> "10.1002/EJP.1050" .
    REGEX = r'^<http:\/\/www\.wikidata\.org\/entity\/(Q\d+)> <http:\/\/www.wikidata.org\/prop\/direct\/(P\d+)> "(.*?)" \.$'
    manifest = ['P356', 'P698', 'P932', 'P2880']
    dump_location = '/public/dumps/public/wikidatawiki/entities/latest-truthy.nt.bz2'
    to_add = {x: [] for x in manifest}

    with bzopen(dump_location, 'r') as f:
        for line in f:
            line = line.decode('utf-8')
            match = re.match(REGEX, line)
            if match is None:
                continue

            wd_item  = match.group(1)
            wd_prop  = match.group(2)
            wd_value = match.group(3)

            if wd_prop in manifest:
                #print('Up to', wd_item, end='\r')
                to_add[wd_prop].append((wd_item, wd_value))

                if len(to_add[wd_prop]) >= 10000:
                    print('\nSaving to Redis')

                    wikidata_to_x = {x[0]: x[1] for x in to_add[wd_prop]}
                    x_to_wikidata = {x[1]: x[0] for x in to_add[wd_prop]}

                    REDIS.hmset(
                        '{0}_to_wikidata_{1}'.format(wd_prop, today),
                        x_to_wikidata)
                    REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today),
                        wikidata_to_x)

                    to_add[wd_prop] = []

    # If there are leftovers
    for wd_prop, tuplelist in to_add.items():
        wikidata_to_x = {x[0]: x[1] for x in tuplelist}
        x_to_wikidata = {x[1]: x[0] for x in tuplelist}

        REDIS.hmset(
            '{0}_to_wikidata_{1}'.format(wd_prop, today),
            x_to_wikidata)
        REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today),
            wikidata_to_x)

    # Finalize
    for wd_prop in manifest:
        REDIS.rename('{0}_to_wikidata_{1}'.format(wd_prop, today),
            '{0}_to_wikidata'.format(wd_prop))
        REDIS.rename('wikidata_to_{0}_{1}'.format(wd_prop, today),
            'wikidata_to_{0}'.format(wd_prop))
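
To see what the regular expression extracts, here is the sample triple from the comment above run through it; the REGEX definition is repeated so the sketch stands alone, and the expected output is shown in the trailing comment.

import re

REGEX = r'^<http:\/\/www\.wikidata\.org\/entity\/(Q\d+)> <http:\/\/www.wikidata\.org\/prop\/direct\/(P\d+)> "(.*?)" \.$'
sample = '<http://www.wikidata.org/entity/Q47133351> <http://www.wikidata.org/prop/direct/P356> "10.1002/EJP.1050" .'
match = re.match(REGEX, sample)
print(match.group(1), match.group(2), match.group(3))   # Q47133351 P356 10.1002/EJP.1050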
def main():
    call_ctr = 0
    sms_ctr = 0
    for day in days_of_the_year:
        print("Processing day:", day)
        commsfile = file_start + day + file_end
        with bzopen(commsfile) as infile:
            print(infile.readline())
            for entry in infile:
                hour, tid, user, call, sms = entry.decode('utf-8').strip().split(',')
                call_ctr += int(call)
                sms_ctr += int(sms)
    print("Total comms for the year, calls:", call_ctr, "sms:", sms_ctr)
def main():
    options = opt_get()
    in_file = options.in_file
    out_file = options.out_file
    n = int(options.n)
    compression_type = options.compression_type

    if compression_type not in ["none", "b", "g"]:
        sys.exit(
            "Wrong compression type used: valid options are 'none', 'b' and 'g'."
        )

    if compression_type == "b":
        from bz2 import BZ2File as bzopen
        fh = bzopen(in_file, 'r')
    if compression_type == "g":
        import gzip
        fh = gzip.open(in_file, 'r')
    else:
        fh = open(in_file, 'r')

    out = open(out_file, 'w')

    iters = 0
    lost = 0
    while True:
        fq_data = fq_read(fh)

        if fq_data == "none":
            print "Processed " + str(iters) + " reads."
            print "Removed " + str(lost) + " reads (" + str(
                float(lost) / float(iters) * 100) + "%)"
            break

        iters += 1
        cut_place = pA_find(fq_data['seq'])
        if cut_place == 0 or len(fq_data['seq'][:cut_place]) < n:
            lost += 1
            continue

        out.write(fq_data['header'])
        out.write(fq_data['seq'][:cut_place] + "\n")
        out.write(fq_data['spacer'])
        out.write(fq_data['qual'][:cut_place] + "\n")

        if iters % 1000 == 0:
            sys.stdout.write('%s\r' % iters)
        sys.stdout.flush()
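
fq_read and pA_find are defined elsewhere. Based on how the returned dict is used above, a hypothetical sketch of fq_read might look like this; it assumes a text-mode handle and a standard 4-line FASTQ record.

def fq_read(fh):
    # Hypothetical sketch: read one FASTQ record (header, sequence, '+', quality).
    header = fh.readline()
    if not header:
        return "none"                     # caller checks for this sentinel at end of file
    seq = fh.readline().rstrip('\n')      # caller re-appends '\n' after trimming
    spacer = fh.readline()
    qual = fh.readline().rstrip('\n')
    return {'header': header, 'seq': seq, 'spacer': spacer, 'qual': qual}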
def main():
    with bzopen('assets/crossref_references.jsonl.bz2', 'r') as f:
        for line in f:
            print('. ', end='', flush=True)
            while threading.active_count() >= THREAD_LIMIT:
                time.sleep(0.25)

            mapping = json.loads(line)
            doi_x = list(mapping.keys())[0]
            lookup = [doi_x]

            for doi_y in mapping[doi_x]:
                lookup.append(doi_y)

            t = threading.Thread(target=process_bundle, args=(lookup, doi_x))
            t.daemon = True
            t.start()
Example #9
def extract_submissions(filename, top_n_subreddits):
    # reading a bz2 archive

    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")

        reddit_submissions = []
        for i, line in enumerate(cbz):
            try:
                post_dict = json.loads(line.rstrip())
                #print(post_dict)

                '''
                {
                'num_comments': 0,
                'author': 'magicks',
                'name': 't3_eut41',
                'subreddit_id': 't5_2qh0u',
                'created_utc': 1293944394,
                }

                n_topics, n_posts
                100, 220304
                200, 255879
                500, 306173
                1000, 338408

                '''
                # score = upvotes - downvotes
                post_id, subreddit_id, user, post_date, num_comments, score  = post_dict['name'], post_dict['subreddit_id'],\
                                                                       post_dict['author'], post_dict['created_utc'],\
                                                                       post_dict['num_comments'], post_dict['score']

                #print(post_id, num_comments, score)
                if subreddit_id in top_n_subreddits and user != '[deleted]':
                    reddit_submissions.append([post_id, subreddit_id, user, post_date, num_comments, score])
                    num_posts_per_subreddit[subreddit_id] += 1

            except Exception:
                # skip lines that are not valid JSON or are missing fields
                continue



    return reddit_submissions
Example #10
def process_file(path, output_path):
    output_length = []
    with bzopen(path) as infile: # unzips the file
        filename = path
        print(filename)
        for i,line in enumerate(infile):
            try:
                tweet = json.loads(line)
            except json.JSONDecodeError as e:
                print(f'Error in {filename} line {i}: {e}')
                continue  # skip lines that are not valid JSON
            if 'text' not in tweet:  # skip records without a text field
                print(f'Error in {filename} line {i}: No text element to read from')
                continue
            else:
                text_field = tweet['text']
            if contains_regex(tweet=text_field, regex=hillary):
                #output.append(tweet)
                with open(output_path, 'a+') as outfile:
                    pos1 = json.dumps(tweet)
                    outfile.write(pos1 + '\n')
    
    print('Wrote tweets to json. Next file.')
Example #11
def process_jsonlines_hotpotqa(filename):
    """
    This is process_jsonlines method for intro-only processed_wikipedia file.
    The item example:
    {"id": "45668011", "url": "https://en.wikipedia.org/wiki?curid=45668011", "title": "Flouch Roundabout",
     "text": ["Flouch Roundabout is a roundabout near Penistone, South Yorkshire, England, where the A628 meets the A616."],
     "charoffset": [[[0, 6],...]]
     "text_with_links" : ["Flouch Roundabout is a roundabout near <a href=\"Penistone\">Penistone</a>,
     <a href=\"South%20Yorkshire\">South Yorkshire</a>, England, where the <a href=\"A628%20road\">A628</a>
     meets the <a href=\"A616%20road\">A616</a>."],
        "charoffset_with_links": [[[0, 6], ... [213, 214]]]}
    """
    # item should be nested list
    extracted_items = []
    # with jsonlines.open(filename) as reader:
    with bzopen(filename, "r") as bzfin:
        for obj in jsonlines.Reader(bzfin):
            wiki_id = obj["id"]
            title = obj["title"]
            title_id = make_wiki_id(title, 0)
            plain_text = "\t".join(obj["text"])
            text_with_links = "\t".join(obj["text_with_links"])

            hyper_linked_titles = []
            hyper_linked_titles = find_hyper_linked_titles(text_with_links)
            if len(hyper_linked_titles) > 0:
                hyper_linked_titles_text = "\t".join(hyper_linked_titles)
            else:
                hyper_linked_titles_text = ""
            extracted_items.append({
                "wiki_id": wiki_id,
                "title": title_id,
                "plain_text": plain_text,
                "hyper_linked_titles": hyper_linked_titles_text,
                "original_title": title
            })

    return extracted_items
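
find_hyper_linked_titles and make_wiki_id are helpers defined elsewhere. Going by the anchor markup shown in the docstring (URL-encoded titles in the href attribute), a hypothetical sketch of the former could be:

import re
from urllib.parse import unquote

def find_hyper_linked_titles(text_with_links):
    # Hypothetical sketch: collect the URL-encoded titles from <a href="..."> anchors.
    return [unquote(href) for href in re.findall(r'<a href="(.*?)">', text_with_links)]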
def extract_comments(filename, top_n_subreddits):
    # reading a bz2 archive

    with bzopen(filename, "r") as bzfin:
        """ Handle lines here """
        cbz = codecs.iterdecode(bzfin, "utf-8")
        #print(len(list(cbz)))

        reddit_submissions = []
        
        for i, line in enumerate(cbz):

            try:
                post_dict = json.loads(line.rstrip())

                #
                #{
                #    "created_utc":1506816000,
                #    "link_id":"t3_73ieyz"
                #}
                
                post_id, comment_date, subreddit_id = post_dict['link_id'], post_dict['created_utc'], post_dict['subreddit_id']

                #if i % 100000 == 0:
                #    print(i, post_id, comment_date, subreddit_id)

                if subreddit_id in top_n_subreddits:
                    post_commentTimes[post_id].append(comment_date)

        
            except Exception:
                # skip lines that are not valid JSON or are missing fields
                continue

        

    return reddit_submissions
Example #13
 def __init__(self, path):
     self.file = bzopen(path)
filename_out = str(sys.argv[2])

min_token_len = 1

satzzeichen = ',.?!:;<>()/\{}#"\'´`‚’‘_→[]-~«»'

exclude_zeichen = '*/=→[]."'

# raw strings keep LaTeX/LilyPond commands like \textit and \new from being
# mangled by escape sequences; '\t' (tab) and '\xa0' (non-breaking space) stay literal
exclude_sonstiges = (r'\textit', '\t', '\xa0', r'\small', r'\sharp', r'\markup',
                     r'\concat', r'\flat', r'\override', r'\translate', r'\set',
                     r'\new')

sen_num = 0
lines_dropped = 0

with bzopen(filename) as bzin, open(filename_out, 'w') as txt_out:
    for line in bzin:
        try:
            line = line.decode('utf-8')
            #        print(line)
            #if("<doc" in line or "</doc>" in line):
            #    continue
            try:
                line_json = json.loads(line)

                text = line_json['text']  #.replace('\n','')
            except:
                text = line

            for a in exclude_sonstiges:
                text = text.replace(a, ' ')
Example #15
subreddits = {'subreddit': []}

for m in unique_months:
    threads[m] = []

    users[m + '_num_thread'] = []
    users[m + '_num_comment'] = []
    users[m + '_num_subreddit'] = []

    subreddits[m + '_num_thread'] = []
    subreddits[m + '_num_comment'] = []
    subreddits[m + '_num_user'] = []
    subreddits[m + '_num_unique_user'] = []

for f in comment_files:
    with bzopen(comment_dir + f, 'rb') as bzfin:
        month = f[:f.index('.')]
        print("\nStart working on month:", month)

        i = 0
        for line in bzfin.readlines():
            line = line.decode('utf-8')
            line = json.loads(line)

            if i % 10000 == 0:
                print(i, 'Start extracting attributes of COMMENTS (comments)')
            # comments attributes
            comment_id = line['id']
            comment = line['body']
            tokens = word_tokenize(comment)
            subreddit = line['subreddit']
def parser_target():
    global prefixes, attacks
    while True:
        try:
            now = dates.get(block=False, timeout=0.125)
        except:
            return

        filename = '../route-views/{}.csv.bz2'.format(now)
        printed = False
        while not os.path.isfile(filename):
            if not printed:
                print('{} waiting for parse to finish'.format(now))
                printed = True
            time.sleep(10)

        print('{} summarizing'.format(now))
        start_time = time.time()

        num_v4_prefixes = 0
        num_v6_prefixes = 0
        num_unique_v4_prefixes = 0
        num_unique_v6_prefixes = 0
        num_v4_isolated = 0
        num_v6_isolated = 0
        num_v4_distributed = 0
        num_v6_distributed = 0
        seen_prefixes = set()
        attacked_prefixes = set()

        with bzopen(filename, 'rt') as f:
            reader = csv.reader(f)
            next(reader)
            for line in reader:
                ip_type = line[2]
                prefix = line[3]

                if ip_type == '4':
                    num_v4_prefixes += 1
                elif ip_type == '6':
                    num_v6_prefixes += 1
                seen_prefixes.add(prefix)
                thread_safe_add(prefix, ip_type)

                if line[5].rstrip() == 'I':
                    if prefix not in attacked_prefixes:
                        if ip_type == '4':
                            num_v4_isolated += 1
                        elif ip_type == '6':
                            num_v6_isolated += 1
                    thread_safe_update(prefix, 2)
                    attacked_prefixes.add(prefix)
                elif line[5].rstrip() == 'D':
                    if prefix not in attacked_prefixes:
                        if ip_type == '4':
                            num_v4_distributed += 1
                        elif ip_type == '6':
                            num_v6_distributed += 1
                    thread_safe_update(prefix, get_num_actors(line[4]))
                    attacked_prefixes.add(prefix)

        for prefix in seen_prefixes:
            with prefixes_lock:
                if prefix in prefixes:
                    if '.' in prefix:
                        num_unique_v4_prefixes += 1
                    elif ':' in prefix:
                        num_unique_v6_prefixes += 1
        end_time = time.time()
        print('{0} -- {1:.3f} sec'.format(now, end_time - start_time))

        record = '{0},{1},{2},{3},{4},{5},{6},{7},{8}\n'.format(
            now, num_v4_prefixes, num_unique_v4_prefixes, num_v6_prefixes,
            num_unique_v6_prefixes, num_v4_isolated, num_v6_isolated,
            num_v4_distributed, num_v6_distributed)

        outbox.put(record)
        del attacked_prefixes
        del seen_prefixes
from bz2 import BZ2File as bzopen

# IPython/Jupyter shell command: collect all .bz2 files under data/
all_filenames = !find data -name "*.bz2"

with open('mega_file.json', 'wb') as f:
    for file in all_filenames:
        stream = bzopen(file, 'r')
        for line in stream:
            f.write(line)
        stream.close()


Example #18
def generate_header_row(input_files):
    tmp_conditions = []
    for input_file in input_files:
        ff = os.path.basename(input_file)
        userid = mapping[ff.split('+',1)[0]]
        count = 0
        tmpbuf = []
        with bzopen(input_file, "rb") as bzfin:
            """ Handle lines here """

            starttime = -1

            for i, line in enumerate(bzfin):
                ln = line.decode().rstrip()
                lsplits = ln.split(',',2)
                try:
                    ema_json = json.loads(lsplits[2][1:-1])
                    tmpbuf.append(ema_json)

                    #['status', 'current_time', 'timestamp', 'id', 'logSchedule', 'message', 'type', 'operation']
                except Exception as e:
                    print(e)
                    print(lsplits[2][1:-1])

                count += 1

        groupedbuf = []
        tmp = []
        srtts = -1
        ema_started = False
        for l in tmpbuf:
            if 'message' in l and l['message'] == 'true: datapoint not found':
                ema_started = True
                continue

            if ema_started:
                #if 'message' in l and l['message'] != 'false: some conditions are failed':
                #    if l['message'] != 'true: all conditions okay':
                tmp.append(l)

            if 'status' in l and (l['status'] == 'COMPLETED' or l['status'] == 'MISSED' or l['status'] == 'ABANDONED_BY_TIMEOUT'):
                ema_started = False
                groupedbuf.append(tmp)
                tmp = []

            if 'message' in l and l['message'] == 'false: some conditions are failed':
                ema_started = False
                if len(tmp):
                    groupedbuf.append(tmp)
                tmp = []

        tab = ','
        for x in groupedbuf:
            for cond in x:
                if 'status' in cond:
                    continue
                condition = cond['type'] + '-' + cond['id']
                if condition not in tmp_conditions:
                    tmp_conditions.append(condition)

    for cd in tmp_conditions:
        if 'VALID_BLOCK' in cd:
            conditions.append(cd)
    conditions.append('BLOCK')
    for cd in tmp_conditions:
        if 'VALID_BLOCK' not in cd:
            conditions.append(cd)
Example #19
def parse_log(input_file):
    ff = os.path.basename(input_file)
    userid = mapping[ff.split('+',1)[0]]
    count = 0
    tmpbuf = []
    csvbuf = ''
    with bzopen(input_file, "rb") as bzfin:
        """ Handle lines here """

        starttime = -1

        for i, line in enumerate(bzfin):
            ln = line.decode().rstrip()
            lsplits = ln.split(',',2)
            try:
                ema_json = json.loads(lsplits[2][1:-1])
                tmpbuf.append(ema_json)

                #['status', 'current_time', 'timestamp', 'id', 'logSchedule', 'message', 'type', 'operation']
            except Exception as e:
                print(e)
                print(lsplits[2][1:-1])

            count += 1

    groupedbuf = []
    tmp = []
    tmptmp = []
    srtts = -1
    ema_started = False
    for l in tmpbuf:
        #"type": "PRIVACY", "id": "PRIVACY"}
        #print(l)
        if 'type' in l and l['type'] == 'PRIVACY' and l['id'] == 'PRIVACY':
            #print('EMA_STARTED')
            ema_started = True
            if len(tmp):
                for aa in tmp:
                    tmptmp.append(aa)
                tmp = []
            #continue

        if ema_started:
            #if 'message' in l and l['message'] != 'false: some conditions are failed':
            #    if l['message'] != 'true: all conditions okay':
            tmp.append(l)

        if 'status' in l and (l['status'] == 'COMPLETED' or l['status'] == 'MISSED' or l['status'] == 'ABANDONED_BY_TIMEOUT'):
            #print('EMA_ENDED')
            ema_started = False
            if len(tmp):
                groupedbuf.append(tmp)
                tmp = []
            if len(tmptmp):
                tmptmp.append(l)
                groupedbuf.append(tmptmp)
                #print(tmptmp)
                tmptmp = []

        #if 'message' in l and l['message'] == 'false: some conditions are failed':
        if 'message' in l and 'false:' in l['message']:
            #print('EMA_ENDED_AAAAAAAAAAAAAAAAAAAA')
            ema_started = False
            if len(tmp):
                groupedbuf.append(tmp)
            tmp = []

    '''
    for x in groupedbuf:
        for y in x:
            print(y)
    '''

    tab = ','
    dup_list = []
    for x in groupedbuf:
        if not len(x): continue
        csv_entry = userid + tab
        if 'status' in x[-1]:
            csv_entry += x[0]['current_time'] + tab+ x[-1]['id'] + tab + x[-1]['status']
        else:
            tmpid = x[-1]['id']
            if 'EMA' not in tmpid:
                for y in x:
                    if 'EMA' in y['id']:
                        tmpid = y['id']
                        if 'VALID_BLOCK_' in tmpid:
                            tmpid = tmpid.split('VALID_BLOCK_')[1]
                        break
            else:
                if 'VALID_BLOCK_' in tmpid:
                    tmpid = tmpid.split('VALID_BLOCK_')[1]


            csv_entry += x[-1]['current_time'] + tab+ tmpid + tab + 'NOT_DELIVERED'
        allconds = {}
        for cond in x:
            if 'status' in cond:
                continue
            condition = cond['type'] + '-' + cond['id']
            
            allconds[condition] = cond['message']

        block = -1
        for acond in conditions:
            #print(acond)
            if acond in allconds:
                tmpstr = allconds[acond]
                if 'VALID_BLOCK' in acond:
                    splits = tmpstr.split(':',1)
                    csv_entry += tab + splits[0].strip() + tab + splits[1].strip()

                    blocks = splits[1].split('block(')
                    if len(blocks) > 1:
                        block = splits[1].split('block(')[1][0]
                else:
                    splits = tmpstr.split(':',1)
                    csv_entry += tab + splits[0].strip() + tab + splits[1].strip()  
            elif acond == 'BLOCK':
                csv_entry += tab + str(block)
                continue
            else:
                csv_entry += tab + tab 
            #print(repr(csv_entry))
        csv_entry += '\n'
        if csv_entry not in dup_list:
            csvbuf += csv_entry
            dup_list.append(csv_entry)
        else:
            print('D'*50)

        #exit(1)

    return csvbuf
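
Both generate_header_row and parse_log rely on module-level globals that are not shown here; a minimal sketch of the assumed setup (names, mapping values, and file locations are illustrative, not from the original):

import glob

mapping = {'phone123': 'P01'}          # assumed: filename prefix -> anonymized user id
conditions = []                        # filled by generate_header_row, consumed by parse_log

input_files = sorted(glob.glob('logs/*.bz2'))    # placeholder location of the EMA log archives
generate_header_row(input_files)
csv_rows = ''.join(parse_log(f) for f in input_files)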
Example #20
from bz2 import BZ2File as bzopen

## Find all IPs that are aliases with both sides of the SACS link in January 2019
ITDK_folders = ['/data/topology/ITDK/ITDK-2019-01/']


### Add list of potential IPs here
#Brazil
Potential_IPs_RB = ['170.238.233.58', '170.238.233.56', '170.238.232.66', '170.238.232.82', '170.238.232.145']
#Angola
Potential_IPs_RA = ['170.238.232.146', '197.149.149.162', '170.238.232.150', '170.238.232.155']


for ITDK_folder in ITDK_folders:
    # reading a bz2 archive
    with bzopen(ITDK_folder + "kapar-midar-iff.nodes.bz2", "r") as bzfin:
        """ Handle lines here """
        lines_RB = []
        List_IPs_RB = []
        
        lines_RA = []
        List_IPs_RA = []
        
        for i, line in enumerate(bzfin):
            line = line.decode('utf-8')  # BZ2File yields bytes; decode before substring checks
            """ look for nodes containing potential_IPs_RB """
            for link_IP in Potential_IPs_RB:
                if link_IP in line and line.rstrip() not in lines_RB:
                    lines_RB.append(line.rstrip())
        
            """ look for nodes containing potential_IPs_RA """
            for link_IP in Potential_IPs_RA:
Example #21
def main():
    # <http://www.wikidata.org/entity/Q47133351> <http://www.wikidata.org/prop/direct/P356> "10.1002/EJP.1050" .
    REGEX = r'^<http:\/\/www\.wikidata\.org\/entity\/(Q\d+)> <http:\/\/www.wikidata.org\/prop\/direct\/(P\d+)> "(.*?)" \.$'
    manifest = ['P356', 'P698', 'P932', 'P2880']
    dump_location = 'https://dumps.wikimedia.org/wikidatawiki/entities/latest-truthy.nt.bz2'
    to_add = {x: [] for x in manifest}

    print("Downloading latest dump")
    with requests.get(dump_location, stream=True) as r:
        print("Saving dump")
        r.raise_for_status()
        with open('/tmp/latest-truthy.nt.bz2', 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Opening dump")
    with bzopen('/tmp/latest-truthy.nt.bz2', 'r') as f:
        for line in f:
            line = line.decode('utf-8')
            match = re.match(REGEX, line)
            if match is None:
                continue

            wd_item  = match.group(1)
            wd_prop  = match.group(2)
            wd_value = match.group(3)

            if wd_prop in manifest:
                #print('Up to', wd_item, end='\r')
                to_add[wd_prop].append((wd_item, wd_value))

                if len(to_add[wd_prop]) >= 10000:
                    print('\nSaving to Redis')

                    wikidata_to_x = {x[0]: x[1] for x in to_add[wd_prop]}
                    x_to_wikidata = {x[1]: x[0] for x in to_add[wd_prop]}

                    REDIS.hmset(
                        '{0}_to_wikidata_{1}'.format(wd_prop, today),
                        x_to_wikidata)
                    REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today),
                        wikidata_to_x)

                    to_add[wd_prop] = []

    # If there are leftovers
    for wd_prop, tuplelist in to_add.items():
        wikidata_to_x = {x[0]: x[1] for x in tuplelist}
        x_to_wikidata = {x[1]: x[0] for x in tuplelist}

        REDIS.hmset(
            '{0}_to_wikidata_{1}'.format(wd_prop, today),
            x_to_wikidata)
        REDIS.hmset('wikidata_to_{0}_{1}'.format(wd_prop, today),
            wikidata_to_x)

    # Finalize
    for wd_prop in manifest:
        REDIS.rename('{0}_to_wikidata_{1}'.format(wd_prop, today),
            '{0}_to_wikidata'.format(wd_prop))
        REDIS.rename('wikidata_to_{0}_{1}'.format(wd_prop, today),
            'wikidata_to_{0}'.format(wd_prop))
Example #22
filename_mask = "output/AA/wiki_%.2d.bz2"
num_range = 44

regex = "([^ ]+ 1[0-9]{3} [^ ]+)"

pres = defaultdict(int)
posts = defaultdict(int)

for x in range(num_range):
    if "%" in filename_mask:
        filename = filename_mask % x
    else:
        filename = filename_mask
    print('Filename:', filename)
    with bzopen(filename) as bzin:
        for line in bzin:
            try:
                line = line.decode('utf-8')

                if line[-1] == '\n':
                    line = line[:-1]
                begin, end = finde_ausdruck(regex, line)
                if end:
                    exp = line[begin:end]
                    split = exp.split()
                    pres[split[0]] += 1
                    posts[split[2]] += 1

            except:
                print("Error in line:", line)
            if max_files != -1:
                if files_processed > max_files:
                    pickle.dump(charger_data, open("save_charger_data.p", "wb"))

                    print("exiting max files processed,saving picklefile")
                    exit()
            files_processed += 1


            end = f_name.find('.charging-locations.kml.bz2')
            time_str = f_name[end-8:end]
            date_str = f_name[end-19:end-9]
            date_str = date_str.replace('_','/')
            #print(date_str + ' ' + time_str)
            # reading a bz2 archive
            with bzopen(f_name, "r") as bzfin:
                kml_data = bzfin.read()
                try:
                    kml_root = pykml.parser.fromstring(kml_data)
                    place_list = kml_root.Document.Placemark
                except:
                    print('ERROR: Captured parse error for file ' + f_name)
                    place_list = []

                for i in place_list:
                    name = str(i.name).encode('utf-8')

                    coord = i.Point.coordinates
                    desc = str(i.description).encode('utf-8')