import pymongo_spark
from pyspark import SparkConf, SparkContext

# Patch SparkContext/RDD with mongoRDD and saveToMongoDB.
pymongo_spark.activate()


def main():
    print('Start!')
    conf = SparkConf().setAppName("pyspark_test")
    sc = SparkContext(conf=conf)
    rdd = sc.mongoRDD('mongodb://localhost:27017/test_database.transactions')
    rdd.saveToMongoDB('mongodb://localhost:27017/test_database.transactions_copy')
    print('Completed!')
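
# A minimal variation on the copy above (a sketch, not part of the original
# example): drop each document's '_id' before saving so MongoDB assigns fresh
# ids in the target collection, the same trick the "transform" job further
# down uses. Assumes pymongo_spark.activate() has already been called.
def copy_without_ids(sc):
    rdd = sc.mongoRDD('mongodb://localhost:27017/test_database.transactions')
    rdd.map(lambda doc: {k: v for k, v in doc.items() if k != '_id'}) \
       .saveToMongoDB('mongodb://localhost:27017/test_database.transactions_copy')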
Example #2
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)

    # Create an RDD backed by the MongoDB collection.
    # This RDD *does not* contain key/value pairs, just documents.
    # If you want key/value pairs, use the mongoPairRDD method instead.
    rdd = sc.mongoRDD('mongodb://localhost:27017/db.contextizer')
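
    # A short sketch of the key/value variant that the comment above mentions
    # (not in the original snippet): mongoPairRDD yields (_id, document)
    # pairs, so the usual pair-RDD operations (keys(), lookup(),
    # reduceByKey(), ...) apply directly.
    pair_rdd = sc.mongoPairRDD('mongodb://localhost:27017/db.contextizer')
    print(pair_rdd.keys().take(5))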
Example #4
def main(args):

    # get conf =============================================================================
    conf = getConf()

    db_host = conf['host']
    db_port = int(conf['port'])
    directory = conf['txt_directory']
    db_name = conf['db_name']
    collection_name_urls = conf['url_collection']
    collection_name_dbstat = conf['dbstat_collection']
    phase1_n_threads = int(conf['geo_indexing_nthread'])
    max_waiting_time = int(conf['max_waiting_time_http'])
    s = int(conf['s'])

    min_loc = None
    max_loc = None
    if conf['bounded_locs'] != "":
        bounded_locs = conf['bounded_locs']
        min_loc, max_loc = bounded_locs[0], bounded_locs[1]
    else:
        min_loc, max_loc = d.getBoundaries(db_host, db_port, db_name,
                                           collection_name_dbstat)

    #========================================================================================

    logs = {}
    # links extraction

    #logs['m1'] = m1.run(db_host, db_port, directory, db_name, collection_name_urls, collection_name_dbstat, phase1_n_threads)

    # Spark context definition
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("Test Spark")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)

    # get urls for the map

    # set up parameters for reading from MongoDB via Hadoop input format

    db_conf = "mongodb://" + db_host + ":" + str(db_port) + "/" + db_name + "."
    db_conf_clicks = db_conf + collection_name_urls

    print(db_conf_clicks)

    # Read from DB
    urlsRDD = sc.mongoRDD(db_conf_clicks)

    # Map Reduce
    a = urlsRDD.map(lambda x: f_download(x, max_waiting_time)) \
               .map(lambda x: f_parse(x)) \
               .map(lambda x: f_cellIndex(x, min_loc, max_loc, s)) \
               .collect()

    print('\n\n\n\n\n\nFINISHED\n\n\n\n\n\n\n')
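
# Hedged sketch (not part of the original job), assuming mongoRDD accepts an
# optional config dict of mongo-hadoop options: a Mongo-side query can then do
# part of the filtering before Spark sees the documents. The 'status' field
# below is purely illustrative.
def read_pending_urls(sc, db_conf_clicks):
    import json
    query_conf = {'mongo.input.query': json.dumps({'status': 'pending'})}
    return sc.mongoRDD(db_conf_clicks, query_conf)
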
def main():
    conf = SparkConf().setAppName("pyspark test")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)


    config = ConfigParser.ConfigParser()
    config.read('configuration.cfg')
    mongodb_connection = config.get('BatchProperties', 'URLMongoDB')

    #######################################################
    # USING THE PYMONGO LIBRARY
    #######################################################
    client = MongoClient()
    db = client.test

    cursor = db.tabla1.find()

    for document in cursor:
        print(document)


    #######################################################
    # USING THE pymongo_spark LIBRARY
    #######################################################
    # Read a MongoDB collection (db: test; collection: tabla1)
    rdd = sc.mongoRDD(mongodb_connection + 'test.tabla1')

    # Save the RDD we just read back to MongoDB (db: test; collection: tabla2)
    rdd.saveToMongoDB(mongodb_connection + 'test.tabla2')

    # Get the project root directory
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    # BASE_DIR = /Users/akash/PycharmProjects/masterbigdata

    # Read a sample file
    file = os.path.join(BASE_DIR + '/datasets/batch/air', 'ficheroSalidaAire.txt')

    rddfFile = sqlContext.jsonFile(file)

    # Save the file to MongoDB
    rddfFile.saveToMongoDB(mongodb_connection + 'test.tabla3')
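
    # Hedged sketch: on newer Spark 1.x releases jsonFile() returns a
    # DataFrame rather than a SchemaRDD, and saveToMongoDB is only patched
    # onto RDDs, so the rows have to be turned back into plain dicts first.
    # Same target collection as above; only the conversion step is new.
    rddfFile.rdd.map(lambda row: row.asDict()) \
        .saveToMongoDB(mongodb_connection + 'test.tabla3')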
def main():
    conf = SparkConf().setAppName("transform")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    conn = "mongodb://{mongo_user}:{mongo_pass}@{mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}".format(
        mongo_user=MONGO_USER,
        mongo_pass=urllib.quote_plus(MONGO_PASSWORD),
        mongo_host=MONGO_HOST,
        mongo_port=MONGO_PORT,
        mongo_db=DB_NAME,
        mongo_collection=COLLECTION_NAME)
    rdd = sc.mongoRDD(conn)

    new_rdd = rdd.map(lambda x: dict([(i, x[i]) for i in x if i != '_id'])
                ).map(lambda x: json.dumps(x, ensure_ascii=False).encode('ascii', 'replace')
                ).map(lambda x: "".join(x.split("\\n")))
    df = sqlContext.jsonRDD(new_rdd)
    df.registerTempTable('events_temp')
    sqlContext.sql('DROP TABLE IF EXISTS default.events')
    sqlContext.sql('DROP TABLE IF EXISTS default.clean_table')
    sqlContext.sql('CREATE TABLE events AS SELECT * FROM events_temp')
    sqlContext.sql("CREATE TABLE clean_table AS SELECT description AS event_desc, id AS event_id, yes_rsvp_count, group.category.name AS cat_name, group.category.shortname AS cat_short, group.category.id AS cat_id, group.name AS group_name, group.topics.name AS topic_name, name AS event_name, time AS start_time, utc_offset AS timezone_offset, venue.state AS venue_state, venue.city AS venue_city, venue.zip AS venue_zip, fee.amount AS fee_amt, fee.required AS req_fee FROM events")
import datetime
import time

import pytz
import pymongo_spark
from pymongo import MongoClient

start_time = time.time()

client = MongoClient('localhost',27017)

utc=pytz.UTC
#datetime.datetime.now().replace(tzinfo=utc)


db = client['disaster']
minuteAnalysisLatest = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.analysisData')


#Objective 1: Get the number of times the key words(222) are used for 1 particular day , For every minute.
dayOne=datetime.datetime(2016, 3, 24, 0, 0 , 0).replace(tzinfo=utc)
incrementByAMinute = datetime.timedelta(minutes=1)
incrementByADay = datetime.timedelta(days=1)
dayOneEnd = dayOne + incrementByADay
dayOneEnd = dayOneEnd.replace(tzinfo=utc)
contentRdd = rdd.map(lambda x: (x['text'],x['created_at'])).filter(lambda (x,y): y > dayOne and y < dayOneEnd).persist()

#count = {'bomb': 0, 'violent storm': 0, 'hijacker': 0, 'bombed': 0, 'sunk': 0, 'avalanche': 0, 'debris': 0, 'body bag': 0, 'battle': 0, 'fear': 0, 'weapons': 0, 'catastrophe': 0, 'forest fire': 0, 'ruin': 0, 'buildings burning': 0, 'blaze': 0, 'fatal': 0, 'airplane accident': 0, 'sinking': 0, 'electrocute': 0, 'rescue': 0, 'hostage': 0, 'massacre': 0, 'traumatised': 0, 'trouble': 0, 'screaming': 0, 'suicide bomb': 0, 'annihilated': 0, 'loud bang': 0, 'floods': 0, 'quarantine': 0, 'obliterate': 0, 'cliff fall': 0, 'body bagging': 0, 'snowstorm': 0, 'whirlwind': 0, 'disaster': 0, 'bleeding': 0, 'razed': 0, 'famine': 0, 'armageddon': 0, 'wreck': 0, 'thunder': 0, 'wrecked': 0, 'crush': 0, 'burned': 0, 'sirens': 0, 'explosion': 0, 'screams': 0, 'rescuers': 0, 'bridge collapse': 0, 'survivors': 0, 'fatality': 0, 'earthquake': 0, 'accident': 0, 'flames': 0, 'detonate': 0, 'mass murderer': 0, 'smoke': 0, 'military': 0, 'stretcher': 0, 'blizzard': 0, 'danger': 0, 'bloody': 0, 'panicking': 0, 'drowned': 0, 'eyewitness': 0, 'devastation': 0, 'bush fires': 0, 'army': 0, 'heat wave': 0, 'emergency plan': 0, 'tragedy': 0, 'collided': 0, 'survive': 0, 'injury': 0, 'riot': 0, 'attacked': 0, 'fire': 0, 'bioterrorism': 0, 'wounds': 0, 'quarantined': 0, 'drown': 0, 'hailstorm': 0, 'casualties': 0, 'mass murder': 0, 'demolish': 0, 'collision': 0, 'pandemonium': 0, 'sandstorm': 0, 'electrocuted': 0, 'landslide': 0, 'flooding': 0, 'mayhem': 0, 'rainstorm': 0, 'demolition': 0, 'blew up': 0, 'hijacking': 0, 'siren': 0, 'terrorist': 0, 'inundated': 0, 'damage': 0, 'lava': 0, 'devastated': 0, 'forest fires': 0, 'outbreak': 0, 'terrorism': 0, 'panic': 0, 'detonation': 0, 'injured': 0, 'deluged': 0, 'windstorm': 0, 'thunderstorm': 0, 'hazard': 0, 'crushed': 0, 'crashed': 0, 'blood': 0, 'buildings on fire': 0, 'destruction': 0, 'deluge': 0, 'weapon': 0, 'sinkhole': 0, 'aftershock': 0, 'ambulance': 0, 'wreckage': 0, 'desolate': 0, 'blown up': 0, 'fatalities': 0, 'injuries': 0, 'bombing': 0, 'structural failure': 0, 'death': 0, 'police': 0, 'destroyed': 0, 'engulfed': 0, 'crash': 0, 'emergency': 0, 'inundation': 0, 'collide': 0, 'blight': 0, 'destroy': 0, 'dust storm': 0, 'mudslide': 0, 'displaced': 0, 'arsonist': 0, 'nuclear reactor': 0, 'blazing': 0, 'lightning': 0, 'explode': 0, 'tsunami': 0, 'burning buildings': 0, 'volcano': 0, 'hijack': 0, 'refugees': 0, 'derailment': 0, 'harm': 0, 'hail': 0, 'bioterror': 0, 'hurricane': 0, 'trauma': 0, 'evacuation': 0, 'cyclone': 0, 'epicentre': 0, 'nuclear disaster': 0, 'hostages': 0, 'obliteration': 0, 'suicide bomber': 0, 'drowning': 0, 'derailed': 0, 'threat': 0, 'apocalypse': 0, 'chemical emergency': 0, 'burning': 0, 'obliterated': 0, 'screamed': 0, 'fire truck': 0, 'seismic': 0, 'wildfire': 0, 'emergency services': 0, 'attack': 0, 'storm': 0, 'catastrophic': 0, 'twister': 0, 'evacuated': 0, 'natural disaster': 0, 'collapse': 0, 'trapped': 0, 'war zone': 0, 'exploded': 0, 'collapsed': 0, 'oil spill': 0, 'evacuate': 0, 'typhoon': 0, 'dead': 0, 'survived': 0, 'first responders': 0, 'keyword': 0, 'radiation emergency': 0, 'annihilation': 0, 'deaths': 0, 'rubble': 0, 'ablaze': 0, 'meltdown': 0, 'casualty': 0, 'body bags': 0, 'upheaval': 0, 'flood': 0, 'demolished': 0, 'rioting': 0, 'hellfire': 0, 'curfew': 0, 'hazardous': 0, 'tornado': 0, 'desolation': 0, 'flattened': 0, 'drought': 0, 'derail': 0, 'arson': 0, 'rescued': 0, 'suicide bombing': 0, 'wild fires': 0, 'wounded': 0}

# for issue in count.keys():
#     print issue
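
# A hedged sketch of Objective 1 (independent of the truncated getCount helper
# below): bucket each tweet by minute, emit one ((minute, keyword), 1) pair for
# every keyword it mentions, and sum the pairs. The three keywords below are
# placeholders for the full 222-term list commented out above.
keywords = ['earthquake', 'flood', 'tornado']

def minute_keyword_pairs(record):
    text, created_at = record
    minute = created_at.replace(second=0, microsecond=0)
    lowered = text.lower()
    return [((minute, kw), 1) for kw in keywords if kw in lowered]

perMinuteCounts = contentRdd.flatMap(minute_keyword_pairs) \
                            .reduceByKey(lambda a, b: a + b)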
def getCount(content):
import datetime
import pytz
import time
from operator import add

import pymongo_spark
from pymongo import MongoClient

start_time = time.time()
client = MongoClient('localhost',27017)
utc=pytz.UTC
db = client['disaster']
threeHourlyAlert = db['minute']

pymongo_spark.activate()
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("pyspark test")
sc = SparkContext(conf=conf)
rdd = sc.mongoRDD('mongodb://localhost:27017/disaster.overAll10MinuteAverage').persist()


dayOne=datetime.datetime(2016, 3, 24, 0, 0 , 0).replace(tzinfo=utc)
incrementBy3Hour = datetime.timedelta(hours=2)

for x in range(288):
    dayOneIncrementBy3Hour = dayOne + incrementBy3Hour
    dayOneIncrementBy3Hour = dayOneIncrementBy3Hour.replace(tzinfo=utc)
    output = rdd.filter(lambda x: x['date'] >= dayOne and x['date'] < dayOneIncrementBy3Hour) \
                .flatMap(lambda x: x['average'].items()) \
                .filter(lambda (k, v): v > 8) \
                .map(lambda (k, v): (k, 1)) \
                .reduceByKey(lambda a, b: a + b) \
                .filter(lambda (k, v): v > 8) \
                .map(lambda (k, v): k) \
                .collect()
    if output != []:
        result = db.threeHourlyAlert.insert_one({"date": dayOne , "count":output})
    dayOne = dayOneIncrementBy3Hour.replace(tzinfo=utc)
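
# Hedged alternative to the 288 driver-side filters above (a sketch, not the
# original approach): tag each record with its window start in a single pass,
# then do the same per-window keyword aggregation with reduceByKey.
window_zero = datetime.datetime(2016, 3, 24, 0, 0, 0).replace(tzinfo=utc)
window_len = incrementBy3Hour.total_seconds()

def window_start(record):
    offset = (record['date'] - window_zero).total_seconds()
    return window_zero + datetime.timedelta(seconds=(offset // window_len) * window_len)

windowedKeywords = rdd.flatMap(
        lambda r: [((window_start(r), kw), 1)
                   for kw, avg in r['average'].items() if avg > 8]) \
    .reduceByKey(add) \
    .filter(lambda kv: kv[1] > 8)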

Example #10
def main():
    conf = SparkConf().setAppName("pyspark read")
    sc = SparkContext(conf=conf)
    mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/estreaming.splash')
    print(mongo_rdd.first())
    "#AugmentedReality", "#BigData", "#DevOps"
]
hashtags_lowercased = [
    "#ai", "#artificialintelligence", "#machinelearning", "#ml",
    "#deeplearning", "#dl", "#datamining", "#vr", "#virtualreality", "#ar",
    "#augmentedreality", "#bigdata", "#devops"
]


def get_hashtag(x):
    if x.lower() in hashtags_lowercased:
        for hashtag in hashtags:
            if hashtag.lower() == x.lower():
                return hashtag
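
# For example, get_hashtag('#bigdata') and get_hashtag('#BIGDATA') both return
# '#BigData' (the canonical casing from the hashtags list above); anything not
# in the list returns None.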


# Important: activate pymongo_spark.
pymongo_spark.activate()

conf = SparkConf().setAppName('SparkBatch').setMaster('local[2]')
sc = SparkContext(conf=conf)

#Reading
mongo_rdd = sc.mongoRDD('mongodb://*****:*****@ds129540.mlab.com:29540/bigdata.tweets')\
          .map(lambda x: json.loads(x['value'])['full_text'])\
          .flatMap(lambda x: x.split()) \
          .filter(lambda x: x.lower() in hashtags_lowercased) \
          .map(lambda x: (get_hashtag(x),1)) \
          .reduceByKey(lambda x,y:x+y) \
          .saveToMongoDB('mongodb://*****:*****@ds129540.mlab.com:29540/bigdata.hashtags')
Example #12
            print doc["_id"]

if __name__ == '__main__':
    pymongo_spark.activate()
    start_time = datetime.time()
    conf = SparkConf().setAppName("LinkingPipeLine")
    
    sc = SparkContext(conf=conf,pyFiles=['/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/LinkPipeMethods.py',
                                            '/home/naveen/spark-1.6.0-bin-hadoop2.6/linkPipe/tagText.py'])
    
    # Set up the RDD
    rdd = sc.mongoRDD('mongodb://10.1.1.5:27017/GaugeDB.test_judgments')
    # Filtering criterion for the RDD
    filterRDD = rdd.filter(lambda x: x["pipefinal"] == 1)
    
    #Config paths
    path1 = "/usr/linkPipModels/CRF-Model-OnlyCodes"
    path2 = "/usr/linkPipModels/CRF-Model-OnlyTitles"
    path3 = "/usr/linkPipModels/VectorSpaceTitles_word.p"
    path4 = "/usr/linkPipModels/VectorSpaceCodes.p"
    path5 = "/usr/linkPipModels/Tf-IdfOnlytitles.p"
    path6 = "/usr/linkPipModels/Tf-IdfCitationCodes.p"
    path7 = "/usr/linkPipModels/TitleClassifier.p"
    path8 = "/usr/linkPipModels/JournalDictForStep1.p"
    path9 = "/usr/linkPipModels/JournalDictForStep2.p"