import json
import os
import sys

import couchdb
import tweepy
from mpi4py import MPI

# sentiment_polarity, get_authorization and get_geocoded_tweets are project
# helpers defined elsewhere; hedged sketches of possible shapes appear below.


def put_data_into_couchdb(db_json, grid_json, data_json):

    with open(db_json) as f:
        db_info = json.load(f)

    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    host_and_port = f"http://{couch_user}:{couch_password}@{couch_host}:{couch_port}"

    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create the processed database
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # it already exists

    try:
        raw_db = couch[raw_db_name]  # open the existing raw database
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create it on first run

    with open(grid_json) as f:
        suburbs = json.load(f)

    count = 0
    with open(data_json, encoding='utf-8') as f:
        f.readline()  # skip the opening line of the CouchDB dump
        while True:
            print(count)
            process_data = []
            raw_data = []
            for i in range(500):  # flush to CouchDB in batches of 500 documents
                line = f.readline().strip("\n").strip()
                if line.endswith(","):
                    line = line[:-1]
                try:
                    tweet = json.loads(line)
                except ValueError:
                    # Closing bracket or EOF reached: flush the partial batch.
                    raw_db.update(raw_data)
                    db.update(process_data)
                    print(count)
                    print("Done.")
                    sys.exit()
                twitter = tweet['doc']
                twitter.pop('_rev')  # drop _rev so CouchDB accepts the insert
                raw_data.append(twitter)
                info = sentiment_polarity(twitter, suburbs)
                if info is not None:
                    process_data.append(info)
                count += 1
            raw_db.update(raw_data)
            db.update(process_data)
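

# sentiment_polarity is a project helper defined elsewhere. The sketch below is
# one possible shape, not the confirmed implementation: it assumes TextBlob for
# polarity scoring and a grid file whose features carry xmin/xmax/ymin/ymax
# bounds and an id, as in a rectangular city grid.
from textblob import TextBlob

def sentiment_polarity(twitter, suburbs):
    # Tweets without a point geometry cannot be assigned to a grid cell.
    if not twitter.get('coordinates'):
        return None
    lon, lat = twitter['coordinates']['coordinates']
    for cell in suburbs.get('features', []):
        p = cell['properties']
        if p['xmin'] <= lon < p['xmax'] and p['ymin'] <= lat < p['ymax']:
            return {
                '_id': twitter['id_str'],
                'cell': p['id'],
                'polarity': TextBlob(twitter['text']).sentiment.polarity,
            }
    return None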


# Method variant used inside a streaming-harvester class: it reads its CouchDB
# settings and the suburb grid from attributes on self.
def output2couchdb(self, data):
    host_and_port = f"http://{self.couch_user}:{self.couch_password}@{self.couch_host}:{self.couch_port}"
    couch = couchdb.Server(host_and_port)
    try:
        db = couch[self.db_name]  # open the existing processed database
    except couchdb.http.ResourceNotFound:
        db = couch.create(self.db_name)  # create it on first use

    try:
        raw_db = couch[self.raw_db_name]  # open the existing raw database
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(self.raw_db_name)  # create it on first use

    twitter = json.loads(data)
    raw_db[twitter['id_str']] = twitter  # store the raw tweet keyed by id_str
    info = sentiment_polarity(twitter, self.suburbs)
    if info is not None:
        db.update([info])
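

# output2couchdb expects couch_user, couch_password, couch_host, couch_port,
# db_name, raw_db_name and suburbs as attributes on self, so it belongs on a
# class. A minimal sketch of one possible host class, assuming tweepy 3.x and
# its StreamListener API; the class name and wiring are illustrative, not the
# project's confirmed design.
class TweetListener(tweepy.StreamListener):
    def __init__(self, db_info, suburbs):
        super().__init__()
        self.couch_user = db_info['user']
        self.couch_password = db_info['password']
        self.couch_host = db_info['host']
        self.couch_port = db_info['port']
        self.db_name = db_info['processed_database']
        self.raw_db_name = db_info['raw_database']
        self.suburbs = suburbs

    output2couchdb = output2couchdb  # reuse the function above as a method

    def on_data(self, data):
        self.output2couchdb(data)
        return True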


def get_tweets(db_json, access_json, grid_json, circle):
    with open(db_json) as f:
        db_info = json.load(f)

    with open(grid_json) as f:
        suburbs = json.load(f)

    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    host_and_port = f"http://{couch_user}:{couch_password}@{couch_host}:{couch_port}"

    couch = couchdb.Server(host_and_port)
    try:
        db = couch[db_name]  # open the existing processed database
    except couchdb.http.ResourceNotFound:
        db = couch.create(db_name)  # create it on first run

    try:
        raw_db = couch[raw_db_name]  # open the existing raw database
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create it on first run

    api = tweepy.API(get_authorization(access_json), wait_on_rate_limit=True)
    for status in tweepy.Cursor(api.search, q='', geocode=circle).items():
        twitter = status._json
        raw_db[twitter['id_str']] = twitter  # store the raw tweet keyed by id_str
        info = sentiment_polarity(twitter, suburbs)
        if info is not None:
            db.update([info])
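

# get_authorization is a project helper defined elsewhere. A minimal sketch of
# one possible shape, assuming tweepy 3.x and an access_json holding the four
# Twitter API keys (the key names are illustrative):
def get_authorization(access_json):
    with open(access_json) as f:
        keys = json.load(f)
    auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
    auth.set_access_token(keys['access_token'], keys['access_token_secret'])
    return auth

# Usage sketch: the geocode circle is tweepy's "lat,long,radius" string; the
# coordinates below (central Melbourne) are illustrative.
#
#   get_tweets("db.json", "access.json", "grid.json", "-37.8136,144.9631,30km")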


# Example #4
def put_data_into_couchdb(db_json, grid_json, data_json):

    with open(db_json) as f:
        db_info = json.load(f)

    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    host_and_port = f"http://{couch_user}:{couch_password}@{couch_host}:{couch_port}"

    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create the processed database
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # it already exists

    try:
        raw_db = couch.create(raw_db_name)  # create the raw database
    except couchdb.http.PreconditionFailed:
        raw_db = couch[raw_db_name]  # it already exists

    with open(grid_json) as f:
        suburbs = json.load(f)

    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()
    rank = comm.Get_rank()

    if rank == 0:
        # Calculate rough start position by dividing the file size by comm size
        FILE_SIZE = os.path.getsize(data_json)  # Read file size
        rough_start = []
        for r in range(comm_size):
            rough_start.append(r * int(FILE_SIZE / comm_size))
        start = [0] * (comm_size + 1)
        file = open(data_json, "r")

        start[0] = len(file.readline())  # rank 0 starts right after the header line

        # Find the actual start position by moving the file reading pointer to the next line
        for r in range(comm_size - 1):
            file.seek(rough_start[r + 1])
            line_break_position = rough_start[r + 1] + len(
                file.readline())  # Find the next line break position
            start[r + 1] = line_break_position
        start[-1] = FILE_SIZE
        file.close()
    else:
        start = None
    start = comm.bcast(start, root=0)

    file = open(data_json, "r")
    # Boundaries in start[] are already aligned to line starts, so seek and
    # begin reading immediately; discarding another line here would silently
    # skip one record per rank.
    file.seek(start[rank])
    count = 0
    batch_size = 100
    while True:
        process_data = []
        raw_data = []
        for i in range(batch_size):
            try:
                line = file.readline()
                if not line:  # EOF
                    raise StopIteration
                line = line.strip("\n").strip()
                if line == ']}':  # closing bracket of the dump
                    raise StopIteration
                if len(line) == 0:
                    continue
                if line.endswith(","):
                    line = line[:-1]
                tweet = json.loads(line)
                twitter = tweet['doc']
                twitter.pop('_rev')  # drop _rev so CouchDB accepts the insert
                raw_data.append(twitter)
                info = sentiment_polarity(twitter, suburbs)
                if info is not None:
                    process_data.append(info)
                count += 1
                if file.tell() >= start[rank + 1]:
                    raise StopIteration  # this rank has reached its end boundary
            except (StopIteration, ValueError, KeyError):
                # Flush the partial batch and finish this rank's share.
                raw_db.update(raw_data)
                db.update(process_data)
                print(count)
                print("Done.")
                file.close()
                sys.exit()
        raw_db.update(raw_data)
        db.update(process_data)
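

# The MPI variant above splits the dump by byte offsets, realigns each rank's
# slice to the next line break, and lets every rank load its share in parallel.
# Launch it under MPI, e.g. (the script name is illustrative):
#
#   mpiexec -n 8 python harvester.py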


# Example #5
def put_data_into_couchdb(db_json, grid_json, data_json):

    with open(db_json) as f:
        db_info = json.load(f)

    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    host_and_port = f"http://{couch_user}:{couch_password}@{couch_host}:{couch_port}"

    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create the processed database
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # it already exists

    try:
        raw_db = couch[raw_db_name]  # open the existing raw database
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create it on first run


    with open(grid_json) as f:
        suburbs = json.load(f)

    with open(data_json, encoding='utf-8') as f:
        geocoded_tweets = json.load(f)

    rows = geocoded_tweets['rows']
    offset = 0
    limit = 500

    while offset < len(rows):
        print(offset)
        process_data = []
        raw_data = []
        for tweet in rows[offset:offset + limit]:  # slicing handles the short tail
            twitter = tweet['doc']
            twitter.pop('_rev')  # drop _rev so CouchDB accepts the insert
            raw_data.append(twitter)
            info = sentiment_polarity(twitter, suburbs)
            if info is not None:
                process_data.append(info)
        try:
            raw_db.update(raw_data)
            db.update(process_data)
        except couchdb.http.ServerError as e:
            print(e)  # log and retry this batch rather than silently spinning
            continue
        offset += limit
    print(offset)
    print("Done.")


def put_data_into_couchdb(db_json, grid_json, start, end):

    with open(db_json) as f:
        db_info = json.load(f)

    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    source_url = db_info['tweet_source']
    host_and_port = f"http://{couch_user}:{couch_password}@{couch_host}:{couch_port}"

    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create the processed database
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # it already exists

    try:
        raw_db = couch[raw_db_name]  # open the existing raw database
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create it on first run

    with open(grid_json) as f:
        suburbs = json.load(f)

    # Basic-auth credentials for the source CouchDB (assumption: the
    # get_geocoded_tweets helper accepts a requests-style (user, password) tuple).
    auth = (couch_user, couch_password)

    # Fetch a single row just to learn the total number of geocoded tweets.
    total_rows = get_geocoded_tweets(source_url,
                                     start_key=start,
                                     end_key=end,
                                     skip=0,
                                     limit=1,
                                     auth=auth)['total_rows']

    limit = 100
    for i in range(total_rows // limit):
        skip = i * limit
        try:
            geocoded_tweets = get_geocoded_tweets(source_url,
                                                  start_key=start,
                                                  end_key=end,
                                                  skip=skip,
                                                  limit=limit,
                                                  auth=auth)
            print(len(geocoded_tweets['rows']))
            process_data = []
            raw_data = []
            for tweet in geocoded_tweets['rows']:
                twitter = tweet['doc']
                twitter.pop('_rev')  # drop _rev so CouchDB accepts the insert
                raw_data.append(twitter)
                info = sentiment_polarity(twitter, suburbs)
                if info is not None:
                    process_data.append(info)
            raw_db.update(raw_data)
            db.update(process_data)
        except Exception as e:
            print(f"batch at skip={skip} failed: {e}")  # log instead of silently dropping

    # Fetch the remaining partial batch past the last full page.
    skip = (total_rows // limit) * limit
    try:
        geocoded_tweets = get_geocoded_tweets(source_url,
                                              start_key=start,
                                              end_key=end,
                                              skip=skip,
                                              auth=auth)
        process_data = []
        raw_data = []
        for tweet in geocoded_tweets['rows']:
            twitter = tweet['doc']
            # Remove _rev so the data can be stored in couchdb
            twitter.pop('_rev')
            raw_data.append(twitter)
            info = sentiment_polarity(twitter, suburbs)
            if info is not None:
                process_data.append(info)
        raw_db.update(raw_data)
        db.update(process_data)
    except Exception as e:
        print(f"final batch failed: {e}")
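

# get_geocoded_tweets is a project helper defined elsewhere. Below is a minimal
# sketch of one possible implementation, assuming source_url points at a
# CouchDB view or _all_docs endpoint and auth is a requests-style
# (user, password) tuple; the parameter mapping is an assumption, not the
# confirmed helper.
import requests

def get_geocoded_tweets(source_url, start_key=None, end_key=None,
                        skip=0, limit=None, auth=None):
    # CouchDB expects startkey/endkey as JSON-encoded values in the query string.
    params = {'include_docs': 'true', 'skip': skip}
    if start_key is not None:
        params['startkey'] = json.dumps(start_key)
    if end_key is not None:
        params['endkey'] = json.dumps(end_key)
    if limit is not None:
        params['limit'] = limit
    resp = requests.get(source_url, params=params, auth=auth)
    resp.raise_for_status()
    return resp.json()  # {'total_rows': ..., 'rows': [{'doc': ...}, ...]}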