import pymongo_spark
from pyspark import SparkConf, SparkContext

# Patch SparkContext/RDD with mongoRDD and saveToMongoDB.
pymongo_spark.activate()


def main():
    print('Start!')
    conf = SparkConf().setAppName("pyspark_test")
    sc = SparkContext(conf=conf)
    rdd = sc.mongoRDD('mongodb://localhost:27017/test_database.transactions')
    rdd.saveToMongoDB('mongodb://localhost:27017/test_database.transactions_copy')
    print('Completed!')
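
# A minimal variation on the copy above (a sketch, not part of the original
# example): drop each document's '_id' before saving so MongoDB assigns fresh
# ids in the target collection, the same trick the "transform" job further
# down uses. Assumes pymongo_spark.activate() has already been called.
def copy_without_ids(sc):
    rdd = sc.mongoRDD('mongodb://localhost:27017/test_database.transactions')
    rdd.map(lambda doc: {k: v for k, v in doc.items() if k != '_id'}) \
       .saveToMongoDB('mongodb://localhost:27017/test_database.transactions_copy')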
Example #2
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)

    # Create an RDD backed by the MongoDB collection.
    # This RDD *does not* contain key/value pairs, just documents.
    # If you want key/value pairs, use the mongoPairRDD method instead.
    rdd = sc.mongoRDD('mongodb://localhost:27017/db.contextizer')
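
    # A short sketch of the key/value variant that the comment above mentions
    # (not in the original snippet): mongoPairRDD yields (_id, document)
    # pairs, so the usual pair-RDD operations (keys(), lookup(),
    # reduceByKey(), ...) apply directly.
    pair_rdd = sc.mongoPairRDD('mongodb://localhost:27017/db.contextizer')
    print(pair_rdd.keys().take(5))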
Example #4
def main(args):

    # get conf =============================================================================
    conf = getConf()

    db_host = conf['host']
    db_port = int(conf['port'])
    directory = conf['txt_directory']
    db_name = conf['db_name']
    collection_name_urls = conf['url_collection']
    collection_name_dbstat = conf['dbstat_collection']
    phase1_n_threads = int(conf['geo_indexing_nthread'])
    max_waiting_time = int(conf['max_waiting_time_http'])
    s = int(conf['s'])

    min_loc = None
    max_loc = None
    if conf['bounded_locs'] != "":
        bounded_locs = conf['bounded_locs']
        min_loc, max_loc = bounded_locs[0], bounded_locs[1]
    else:
        min_loc, max_loc = d.getBoundaries(db_host, db_port, db_name,
                                           collection_name_dbstat)

    #========================================================================================

    logs = {}
    # links extraction

    #logs['m1'] = m1.run(db_host, db_port, directory, db_name, collection_name_urls, collection_name_dbstat, phase1_n_threads)

    # Spark context definition
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("Test Spark")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)

    # get urls for the map

    # set up parameters for reading from MongoDB via Hadoop input format

    db_conf = "mongodb://" + db_host + ":" + str(db_port) + "/" + db_name + "."
    db_conf_clicks = db_conf + collection_name_urls

    print(db_conf_clicks)

    # Read from DB
    urlsRDD = sc.mongoRDD(db_conf_clicks)

    # Map Reduce
    a = urlsRDD.map(lambda x: f_download(x, max_waiting_time)) \
               .map(lambda x: f_parse(x)) \
               .map(lambda x: f_cellIndex(x, min_loc, max_loc, s)) \
               .collect()

    print('\n\n\n\n\n\nFINISHED\n\n\n\n\n\n\n')
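
# Hedged sketch (not part of the original job), assuming mongoRDD accepts an
# optional config dict of mongo-hadoop options: a Mongo-side query can then do
# part of the filtering before Spark sees the documents. The 'status' field
# below is purely illustrative.
def read_pending_urls(sc, db_conf_clicks):
    import json
    query_conf = {'mongo.input.query': json.dumps({'status': 'pending'})}
    return sc.mongoRDD(db_conf_clicks, query_conf)
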
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)


    config = ConfigParser.ConfigParser()
    config.read('configuration.cfg')
    mongodb_connection = config.get('BatchProperties', 'URLMongoDB')

    #######################################################
    # USING THE PYMONGO LIBRARY
    #######################################################
    client = MongoClient()
    db = client.test

    cursor = db.tabla1.find()

    for document in cursor:
        print(document)


    #######################################################
    # USING THE pymongo_spark LIBRARY
    #######################################################
    # Read a MongoDB collection (db: test; collection: tabla1)
    rdd = sc.mongoRDD(mongodb_connection + 'test.tabla1')

    # Save the RDD we just read back to MongoDB (db: test; collection: tabla2)
    rdd.saveToMongoDB(mongodb_connection + 'test.tabla2')

    # Get the project root directory
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    # BASE_DIR = /Users/akash/PycharmProjects/masterbigdata

    # Read a sample file
    file = os.path.join(BASE_DIR + '/datasets/batch/air', 'ficheroSalidaAire.txt')

    rddfFile = sqlContext.jsonFile(file)

    # Save the file to MongoDB
    rddfFile.saveToMongoDB(mongodb_connection + 'test.tabla3')
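
    # Hedged sketch: on newer Spark 1.x releases jsonFile() returns a
    # DataFrame rather than a SchemaRDD, and saveToMongoDB is only patched
    # onto RDDs, so the rows have to be turned back into plain dicts first.
    # Same target collection as above; only the conversion step is new.
    rddfFile.rdd.map(lambda row: row.asDict()) \
        .saveToMongoDB(mongodb_connection + 'test.tabla3')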
def main():
    conf = SparkConf().setAppName("transform")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    conn = "mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}".format(
        mongo_user=MONGO_USER,
        mongo_pass=urllib.quote_plus(MONGO_PASSWORD),
        mongo_host=MONGO_HOST,
        mongo_port=MONGO_PORT,
        mongo_db=DB_NAME,
        mongo_collection=COLLECTION_NAME)
    rdd = sc.mongoRDD(conn)

    new_rdd = rdd.map(lambda x: dict([(i, x[i]) for i in x if i != '_id'])
                ).map(lambda x: json.dumps(x, ensure_ascii=False).encode('ascii', 'replace')
                ).map(lambda x: "".join(x.split("\\n")))
    df = sqlContext.jsonRDD(new_rdd)
    df.registerTempTable('events_temp')
    sqlContext.sql('DROP TABLE IF EXISTS default.events')
    sqlContext.sql('DROP TABLE IF EXISTS default.clean_table')
    sqlContext.sql('CREATE TABLE events AS SELECT * FROM events_temp')
    sqlContext.sql("CREATE TABLE clean_table AS SELECT description AS event_desc, id AS event_id, yes_rsvp_count, group.category.name AS cat_name, group.category.shortname AS cat_short, group.category.id AS cat_id, group.name AS group_name, group.topics.name AS topic_name, name AS event_name, time AS start_time, utc_offset AS timezone_offset, venue.state AS venue_state, venue.city AS venue_city, venue.zip AS venue_zip, fee.amount AS fee_amt, fee.required AS req_fee FROM events")
import datetime
import time

import pytz
import pymongo_spark
from pymongo import MongoClient

start_time = time.time()

client = MongoClient('localhost',27017)

utc=pytz.UTC
#datetime.datetime.now().replace(tzinfo=utc)


db = client['disaster']
minuteAnalysisLatest = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.analysisData')


#Objective 1: Get the number of times the key words(222) are used for 1 particular day , For every minute.
dayOne=datetime.datetime(2016, 3, 24, 0, 0 , 0).replace(tzinfo=utc)
incrementByAMinute = datetime.timedelta(minutes=1)
incrementByADay = datetime.timedelta(days=1)
dayOneEnd = dayOne + incrementByADay
dayOneEnd = dayOneEnd.replace(tzinfo=utc)
contentRdd = rdd.map(lambda x: (x['text'],x['created_at'])).filter(lambda (x,y): y > dayOne and y < dayOneEnd).persist()

#count = {'bomb': 0, 'violent storm': 0, 'hijacker': 0, 'bombed': 0, 'sunk': 0, 'avalanche': 0, 'debris': 0, 'body bag': 0, 'battle': 0, 'fear': 0, 'weapons': 0, 'catastrophe': 0, 'forest fire': 0, 'ruin': 0, 'buildings burning': 0, 'blaze': 0, 'fatal': 0, 'airplane accident': 0, 'sinking': 0, 'electrocute': 0, 'rescue': 0, 'hostage': 0, 'massacre': 0, 'traumatised': 0, 'trouble': 0, 'screaming': 0, 'suicide bomb': 0, 'annihilated': 0, 'loud bang': 0, 'floods': 0, 'quarantine': 0, 'obliterate': 0, 'cliff fall': 0, 'body bagging': 0, 'snowstorm': 0, 'whirlwind': 0, 'disaster': 0, 'bleeding': 0, 'razed': 0, 'famine': 0, 'armageddon': 0, 'wreck': 0, 'thunder': 0, 'wrecked': 0, 'crush': 0, 'burned': 0, 'sirens': 0, 'explosion': 0, 'screams': 0, 'rescuers': 0, 'bridge collapse': 0, 'survivors': 0, 'fatality': 0, 'earthquake': 0, 'accident': 0, 'flames': 0, 'detonate': 0, 'mass murderer': 0, 'smoke': 0, 'military': 0, 'stretcher': 0, 'blizzard': 0, 'danger': 0, 'bloody': 0, 'panicking': 0, 'drowned': 0, 'eyewitness': 0, 'devastation': 0, 'bush fires': 0, 'army': 0, 'heat wave': 0, 'emergency plan': 0, 'tragedy': 0, 'collided': 0, 'survive': 0, 'injury': 0, 'riot': 0, 'attacked': 0, 'fire': 0, 'bioterrorism': 0, 'wounds': 0, 'quarantined': 0, 'drown': 0, 'hailstorm': 0, 'casualties': 0, 'mass murder': 0, 'demolish': 0, 'collision': 0, 'pandemonium': 0, 'sandstorm': 0, 'electrocuted': 0, 'landslide': 0, 'flooding': 0, 'mayhem': 0, 'rainstorm': 0, 'demolition': 0, 'blew up': 0, 'hijacking': 0, 'siren': 0, 'terrorist': 0, 'inundated': 0, 'damage': 0, 'lava': 0, 'devastated': 0, 'forest fires': 0, 'outbreak': 0, 'terrorism': 0, 'panic': 0, 'detonation': 0, 'injured': 0, 'deluged': 0, 'windstorm': 0, 'thunderstorm': 0, 'hazard': 0, 'crushed': 0, 'crashed': 0, 'blood': 0, 'buildings on fire': 0, 'destruction': 0, 'deluge': 0, 'weapon': 0, 'sinkhole': 0, 'aftershock': 0, 'ambulance': 0, 'wreckage': 0, 'desolate': 0, 'blown up': 0, 'fatalities': 0, 'injuries': 0, 'bombing': 0, 'structural failure': 0, 'death': 0, 'police': 0, 'destroyed': 0, 'engulfed': 0, 'crash': 0, 'emergency': 0, 'inundation': 0, 'collide': 0, 'blight': 0, 'destroy': 0, 'dust storm': 0, 'mudslide': 0, 'displaced': 0, 'arsonist': 0, 'nuclear reactor': 0, 'blazing': 0, 'lightning': 0, 'explode': 0, 'tsunami': 0, 'burning buildings': 0, 'volcano': 0, 'hijack': 0, 'refugees': 0, 'derailment': 0, 'harm': 0, 'hail': 0, 'bioterror': 0, 'hurricane': 0, 'trauma': 0, 'evacuation': 0, 'cyclone': 0, 'epicentre': 0, 'nuclear disaster': 0, 'hostages': 0, 'obliteration': 0, 'suicide bomber': 0, 'drowning': 0, 'derailed': 0, 'threat': 0, 'apocalypse': 0, 'chemical emergency': 0, 'burning': 0, 'obliterated': 0, 'screamed': 0, 'fire truck': 0, 'seismic': 0, 'wildfire': 0, 'emergency services': 0, 'attack': 0, 'storm': 0, 'catastrophic': 0, 'twister': 0, 'evacuated': 0, 'natural disaster': 0, 'collapse': 0, 'trapped': 0, 'war zone': 0, 'exploded': 0, 'collapsed': 0, 'oil spill': 0, 'evacuate': 0, 'typhoon': 0, 'dead': 0, 'survived': 0, 'first responders': 0, 'keyword': 0, 'radiation emergency': 0, 'annihilation': 0, 'deaths': 0, 'rubble': 0, 'ablaze': 0, 'meltdown': 0, 'casualty': 0, 'body bags': 0, 'upheaval': 0, 'flood': 0, 'demolished': 0, 'rioting': 0, 'hellfire': 0, 'curfew': 0, 'hazardous': 0, 'tornado': 0, 'desolation': 0, 'flattened': 0, 'drought': 0, 'derail': 0, 'arson': 0, 'rescued': 0, 'suicide bombing': 0, 'wild fires': 0, 'wounded': 0}

# for issue in count.keys():
#     print issue
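
# A hedged sketch of Objective 1 (independent of the truncated getCount helper
# below): bucket each tweet by minute, emit one ((minute, keyword), 1) pair for
# every keyword it mentions, and sum the pairs. The three keywords below are
# placeholders for the full 222-term list commented out above.
keywords = ['earthquake', 'flood', 'tornado']

def minute_keyword_pairs(record):
    text, created_at = record
    minute = created_at.replace(second=0, microsecond=0)
    lowered = text.lower()
    return [((minute, kw), 1) for kw in keywords if kw in lowered]

perMinuteCounts = contentRdd.flatMap(minute_keyword_pairs) \
                            .reduceByKey(lambda a, b: a + b)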
def getCount(content):
import datetime
import pytz
import time
from operator import add

import pymongo_spark
from pymongo import MongoClient

start_time = time.time()
client = MongoClient('localhost',27017)
utc=pytz.UTC
db = client['disaster']
threeHourlyAlert = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.overAll10MinuteAverage').persist()


dayOne=datetime.datetime(2016, 3, 24, 0, 0 , 0).replace(tzinfo=utc)
incrementBy3Hour = datetime.timedelta(hours=2)

for x in range(288):
    dayOneIncrementBy3Hour = dayOne + incrementBy3Hour
    dayOneIncrementBy3Hour = dayOneIncrementBy3Hour.replace(tzinfo=utc)
    output = rdd.filter(lambda x: x['date'] >= dayOne and x['date'] < dayOneIncrementBy3Hour) \
                .flatMap(lambda x: x['average'].items()) \
                .filter(lambda (k, v): v > 8) \
                .map(lambda (k, v): (k, 1)) \
                .reduceByKey(lambda a, b: a + b) \
                .filter(lambda (k, v): v > 8) \
                .map(lambda (k, v): k) \
                .collect()
    if output != []:
        result = db.threeHourlyAlert.insert_one({"date": dayOne , "count":output})
    dayOne = dayOneIncrementBy3Hour.replace(tzinfo=utc)
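
# Hedged alternative to the 288 driver-side filters above (a sketch, not the
# original approach): tag each record with its window start in a single pass,
# then do the same per-window keyword aggregation with reduceByKey.
window_zero = datetime.datetime(2016, 3, 24, 0, 0, 0).replace(tzinfo=utc)
window_len = incrementBy3Hour.total_seconds()

def window_start(record):
    offset = (record['date'] - window_zero).total_seconds()
    return window_zero + datetime.timedelta(seconds=(offset // window_len) * window_len)

windowedKeywords = rdd.flatMap(
        lambda r: [((window_start(r), kw), 1)
                   for kw, avg in r['average'].items() if avg > 8]) \
    .reduceByKey(add) \
    .filter(lambda kv: kv[1] > 8)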

Example #10
def main():
    conf = SparkConf().setAppName("pyspark read")
    sc = SparkContext(conf=conf)
    mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/estreaming.splash')
    print(mongo_rdd.first())
    "#AugmentedReality", "#BigData", "#DevOps"
]
hashtags_lowercased = [
    "#ai", "#artificialintelligence", "#machinelearning", "#ml",
    "#deeplearning", "#dl", "#datamining", "#vr", "#virtualreality", "#ar",
    "#augmentedreality", "#bigdata", "#devops"
]


def get_hashtag(x):
    if x.lower() in hashtags_lowercased:
        for hashtag in hashtags:
            if hashtag.lower() == x.lower():
                return hashtag
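
# For example, get_hashtag('#bigdata') and get_hashtag('#BIGDATA') both return
# '#BigData' (the canonical casing from the hashtags list above); anything not
# in the list returns None.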


# Important: activate pymongo_spark.
pymongo_spark.activate()

conf = SparkConf().setAppName('SparkBatch').setMaster('local[2]')
sc = SparkContext(conf=conf)

#Reading
mongo_rdd = sc.mongoRDD('mongodb://*****:*****@ds129540.mlab.com:29540/bigdata.tweets')\
          .map(lambda x: json.loads(x['value'])['full_text'])\
          .flatMap(lambda x: x.split()) \
          .filter(lambda x: x.lower() in hashtags_lowercased) \
          .map(lambda x: (get_hashtag(x),1)) \
          .reduceByKey(lambda x,y:x+y) \
          .saveToMongoDB('mongodb://*****:*****@ds129540.mlab.com:29540/bigdata.hashtags')
Example #12
            print doc["_id"]

if __name__ == '__main__':
    pymongo_spark.activate()
    start_time = datetime.time()
    conf = SparkConf().setAppName("LinkingPipeLine")
    
    sc = SparkContext(conf=conf,pyFiles=['/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/LinkPipeMethods.py',
                                            '/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/tagText.py'])
    
    # Set up the RDD
    rdd = sc.mongoRDD('mongodb://10.1.1.5:27017/GaugeDB.test_judgments')
    # Filtering criterion for the RDD
    filterRDD = rdd.filter(lambda x: x["pipefinal"] == 1)
    
    #Config paths
    path1 = "/usr/linkPipModels/CRF-Model-OnlyCodes"
    path2 = "/usr/linkPipModels/CRF-Model-OnlyTitles"
    path3 = "/usr/linkPipModels/VectorSpaceTitles_word.p"
    path4 = "/usr/linkPipModels/VectorSpaceCodes.p"
    path5 = "/usr/linkPipModels/Tf-IdfOnlytitles.p"
    path6 = "/usr/linkPipModels/Tf-IdfCitationCodes.p"
    path7 = "/usr/linkPipModels/TitleClassifier.p"
    path8 = "/usr/linkPipModels/JournalDictForStep1.p"
    path9 = "/usr/linkPipModels/JournalDictForStep2.p"