Example #1
def insertAll(time, time2):
    # Assumes module-level URL, arXivRaw_reader and insert() are defined
    # elsewhere, plus: from oaipmh.client import Client;
    # from oaipmh.metadata import MetadataRegistry; from datetime import datetime
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    records = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in records:
        try:
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0], '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract, date=date, cross_srs=sr2)
        except Exception:
            print 'ERROR'
            print a
            errors += 1
    print 'Completed with %s errors' % errors
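A minimal call, assuming the module-level URL, arXivRaw_reader and insert() that the function relies on are in place (the one-day window here is illustrative):

from datetime import datetime

# Harvest and insert one day of arXivRaw records
insertAll(datetime(2012, 7, 1), datetime(2012, 7, 2))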
Example #2
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    # Requires: import time; from oaipmh.client import Client;
    # from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc', set="{}".format(subject), from_=start, until=end)

    for _, md, _ in records:

        # md can be None, e.g. some 2010 records carry no metadata (not even a title)
        if md is not None:

            txt_dict = {"title": md["title"],
                        "abstract": md["description"],
                        "date": md["date"],
                        "subject": md["subject"],
                        "url": md["identifier"],
                        "authors": md['creator']}

            output.append(txt_dict)

        time.sleep(sleep_time)

    return output
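For reference, a call might look like this (the set name and dates are illustrative; the endpoint's available sets can be listed with client.listSets()):

from datetime import datetime

# One week of hep-ex records, pausing one second between records
docs = arxiv_oai_scraper("physics:hep-ex", datetime(2012, 7, 1), datetime(2012, 7, 8), sleep_time=1)
print("%d records" % len(docs))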
Example #3
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to kwargs
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, metadata_registry)
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
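This variant reads a module-level metadata_registry that the snippet does not show; a minimal setup, assumed to mirror the registry code in the other examples, would be:

from oaipmh.metadata import MetadataRegistry, oai_dc_reader

metadata_registry = MetadataRegistry()
metadata_registry.registerReader('oai_dc', oai_dc_reader)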
Example #4
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to kwargs
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        yield record
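NotOAIPMHBaseURLException is a custom exception assumed to be defined elsewhere in the same module; a minimal definition would be:

class NotOAIPMHBaseURLException(Exception):
    # Raised when identify() fails because the URL is not an OAI-PMH endpoint
    pass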
Example #6
def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs):
    # Generator to yield records from baseUrl in the given metadataPrefix
    # Add metadataPrefix to kwargs
    kwargs['metadataPrefix'] = metadataPrefix
    client = Client(baseUrl, self._mdRegistry)
    # Check that baseUrl actually represents an OAI-PMH target
    try:
        client.identify()
    except IndexError:
        raise NotOAIPMHBaseURLException(
            "{0} does not appear to be an OAI-PMH compatible base URL"
            "".format(baseUrl))
    # Check server timestamp granularity support
    client.updateGranularity()
    for record in client.listRecords(**kwargs):
        # Unit test hotfix
        header, metadata, about = record
        # Fix pyoai returning a "b'...'" string for py3k (requires: import ast)
        if isinstance(metadata, str) and metadata.startswith("b'"):
            metadata = ast.literal_eval(metadata).decode("utf-8")
        yield (header, metadata, about)
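The last branch works around pyoai handing back metadata as the string repr of a bytes object under Python 3; stand-alone, the fix behaves like this (the payload is illustrative):

import ast

raw = "b'<dc:title>example</dc:title>'"  # stringified bytes, as pyoai can return
fixed = ast.literal_eval(raw).decode("utf-8")
print(fixed)  # <dc:title>example</dc:title>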
Example #8
def now():
    return datetime.now().ctime()

print >>sys.stderr, "beginning @", now()

URL = "http://citeseerx.ist.psu.edu/oai2"

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)

client = Client(URL, registry)
client.updateGranularity()

store = Store()

if len(sys.argv) > 1:
    start = datetime.strptime(sys.argv[1], '%Y-%m-%d') #2011-10-27, for instance
elif store.last():
    start = store.last()
else:
    start = client.identify().earliestDatestamp()

#Try this and see if it works; if the server handles resumption tokens
#correctly, this should work fine.

chunk = timedelta(days=1)
oneday = timedelta(days=1)
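From here, a day-by-day harvest loop in the spirit of this script might look like the following (store.add() is a hypothetical Store method; the other names reuse the definitions above):

current = start
while current <= datetime.now():
    # listRecords follows OAI-PMH resumption tokens internally, so each
    # one-day window is consumed as a single logical request
    for header, metadata, about in client.listRecords(
            metadataPrefix='oai_dc', from_=current, until=current + chunk):
        store.add(header, metadata)  # hypothetical persistence call
    current += oneday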
Example #10
def oaiSpider(subject="hep-ex", section="physics", start=None, end=None, sleep_time=0):
    '''
    Pull articles using the Open Archives Initiative protocol
    
    subject    - String defining the subset of the main section
    section    - String defining the main section (typically "physics" or empty)
    start      - A datetime.datetime object restricting the starting date of returned articles
    end        - A datetime.datetime object restricting the ending date of the returned articles
    sleep_time - Number of seconds to wait between record queries (time.sleep takes seconds)
    
    Examples

       oaiSpider("hep-ex", "physics")
       ==> returns all HEP experiment articles
       
       oaiSpider("cs", "", datetime(2011,06,24))
       ==> returns all computer science articles submitted after June 24th, 2011
       
       oaiSpider("hep-ph", "physics", None, datetime(2011,06, 24))
       ==> returns all HEP phenomenology articles submitted before June 24th, 2011

    Returns a list of dictionaries containing the article metadata
    '''

    from oaipmh.client import Client
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    base_url = "http://export.arxiv.org/oai2"
    output = []

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    if section is None:
        section = ""
    if len(section) > 0 and section[-1] != ":":
        section += ":"

    # sets = client.listSets()
    # for entry in sets:
    #     print entry
    
    ### NOTE: the oaipmh module is temperamental here;
    # it can raise errors when the from_ or until keys are used
    records = client.listRecords(metadataPrefix='oai_dc',
                                 set='%s%s' % (section, subject),
                                 from_=start,
                                 until=end)
    
    counter = 0
    
    for (header, metadata, aux) in records:
        
        print counter

        # for key in  metadata._map.keys():
        #     print key, metadata[key]

        output.append({"title"    : cleanText(metadata["title"][0]),
                       "abstract" : cleanText(metadata["description"][0]),
                       "date"     : convertDate(max(metadata["date"])),
                       "subject"  : subject,
                       "url"      : metadata["identifier"][0],
                       "authors"  : "; ".join( metadata['creator']),
                       })

        print output[-1]
        counter += 1
        
        time.sleep(sleep_time)

    return output
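oaiSpider relies on cleanText and convertDate helpers defined elsewhere; as a rough idea of their contracts, minimal stand-ins (assumptions, not the original implementations) could be:

import re
from datetime import datetime

def cleanText(text):
    # Stand-in: collapse runs of whitespace left by the XML text nodes
    return re.sub(r'\s+', ' ', text).strip()

def convertDate(stamp):
    # Stand-in: arXiv OAI datestamps look like '2011-06-24'
    return datetime.strptime(stamp, '%Y-%m-%d')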
class OpenBeeldenDataLoader(DataLoader):

	def __init__(self):
		self.ES_INDEX = 'et_openbeelden'
		self.ES_DOC_TYPE = 'mediaresource'
		self.es_local = Elasticsearch(host=LTV_ES_SETTINGS['host'], port=LTV_ES_SETTINGS['port'])

	def loadMediaResourceData(self, resourceUri, clientIP, loadAnnotations):
		mediaResource = MediaResource(resourceUri)

		#load the annotations (only named entities in this case)
		mediaResource = self.__getAllAnnotationsOfResource(mediaResource)

		#fetch the video metadata
		mediaResource = self.__getAllVideoMetadata(mediaResource, clientIP)

		#transform the mediaresource object to JSON and return it
		resp = simplejson.dumps(mediaResource, default=lambda obj: obj.__dict__)
		return resp

	def loadMediaResources(self, provider):#ignores provider
		return self.loadOpenBeeldenItemsFromES(0, [])


	def loadOpenBeeldenItemsFromES(self, offset, videos):
		query = {
			"query": {
				"match_all": {}
			},
  			"fields": [],
  			"from": offset,
			"size": 300
		}
		resp = self.es_local.search(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, body=query, timeout="10s")
		if resp and len(resp['hits']['hits']) > 0:
			print len(resp['hits']['hits'])
			vids = []
			for hit in resp['hits']['hits']:
				vid = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=hit['_id'])
				vids.append(vid['_source'])
			for vd in vids:
				video = {
					'id' : vd['id'].replace(':', '_'),
					'title' : '; '.join(vd['title']),
					'date' : '; '.join(vd['date']),
					'locator' : self.__getMediumByExtension(vd['medium'], 'mp4'),
					'thumbUrl' : self.__getMediumByExtension(vd['medium'], 'png'),
					'thumbBaseUrl' : ''
				}
				videos.append(video)
			self.loadOpenBeeldenItemsFromES(offset + 300, videos)
		return {'videos' : videos}


	def __getMediumByExtension(self, mediums, extension):
		poster = None
		for m in mediums:
			if m.find('.%s' % extension) != -1:
				poster = m
				break
		return poster

	def __getAllAnnotationsOfResource(self, mediaResource):
		nes = []
		"""
		nes.append(NamedEntity(
			label,
			entityType=LinkedTVDataUtils.getNEType(DCType, RDFType, OWLSameAs),
			subTypes=LinkedTVDataUtils.getDCTypes(DCType),
			disambiguationURL=OWLSameAs,
			start=start,
			end=end,
			annotationURI=annotationURI,
			relevance=r,
			confidence=c
			)
		)
		"""
		mediaResource.setNamedEntities(nes)

		return mediaResource

	def __getAllVideoMetadata(self, mediaResource, clientIP):
		print mediaResource.getId()
		vd = self.es_local.get(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=mediaResource.getId().replace('_', ':'))
		if vd:
			vd = vd['_source']
			mediaResource.setVideoMetadata(vd)

			mediaResource.setPlayoutUrl(self.__getMediumByExtension(vd['medium'], 'mp4'))

			#set the video metadata in the mediaresource
			mediaResource.setTitle('; '.join(vd['title']))
			mediaResource.setDate('; '.join(vd['date']))
			mediaResource.setThumbBaseUrl(None)
			mediaResource.setSrtUrl(None)
			mediaResource.setSubtitles(None)

		return mediaResource

	def setupOAIPMHConnection(self):
		oai_oi_reader = MetadataReader(
		    fields={
		    'title':       ('textList', 'oai_oi:oi/oi:title/text()'),
		    'alternative':       ('textList', 'oai_oi:oi/oi:alternative/text()'),
		    'creator':     ('textList', 'oai_oi:oi/oi:creator/text()'),
		    'subject':     ('textList', 'oai_oi:oi/oi:subject/text()'),
		    'description': ('textList', 'oai_oi:oi/oi:description/text()'),
		    'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
		    'publisher':   ('textList', 'oai_oi:oi/oi:publisher/text()'),
		    'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
		    'date':        ('textList', 'oai_oi:oi/oi:date/text()'),
		    'type':        ('textList', 'oai_oi:oi/oi:type/text()'),
		    'extent':        ('textList', 'oai_oi:oi/oi:extent/text()'),
		    'medium':        ('textList', 'oai_oi:oi/oi:medium/text()'),
		    'identifier':  ('textList', 'oai_oi:oi/oi:identifier/text()'),
		    'source':      ('textList', 'oai_oi:oi/oi:source/text()'),
		    'language':    ('textList', 'oai_oi:oi/oi:language/text()'),
		    'references':    ('textList', 'oai_oi:oi/oi:references/text()'),
		    'spatial':    ('textList', 'oai_oi:oi/oi:spatial/text()'),
		    'attributionName':    ('textList', 'oai_oi:oi/oi:attributionName/text()'),
		    'attributionURL':    ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
		    'license':      ('textList', 'oai_oi:oi/oi:license/text()')
		    },

		    namespaces={
		    	'oai_oi': 'http://www.openbeelden.nl/feeds/oai/', #'http://www.openarchives.org/OAI/2.0/oai_oi/',
		    	'oi': 'http://www.openbeelden.nl/oai/'
		    }
		)

		URL = 'http://www.openbeelden.nl/feeds/oai/'

		#Initialize the OAI client
		self.registry = MetadataRegistry()
		self.registry.registerReader('oai_oi', oai_oi_reader)
		self.client = Client(URL, self.registry)

		#Test if the connection to the OAI-PMH provider works
		self.client.updateGranularity()
		x = self.client.identify()
		print 'identity %s' % x.repositoryName()
		print 'identity %s' % x.protocolVersion()
		print 'identity %s' % x.baseURL()

		"""
		for s in client.listSets():
			print s
		"""

		#initialize the OpenSKOSHandler
		self.openSKOSHandler = OpenSKOSHandler()

	def reindex(self, provider=None):
		self.setupOAIPMHConnection()
		#sets: stichting_natuurbeelden, beeldengeluid
		for rec in self.client.listRecords(metadataPrefix=u'oai_oi', set=u'beeldengeluid'):
			header, metadata, about = rec

			extent = metadata.getField('extent')[0]
			item = {
				'id' : header.identifier(),
				'identifier' : self.getFieldData(metadata, 'identifier'),
				'title' : self.getFieldData(metadata, 'title'),
				'alternative' : self.getFieldData(metadata, 'alternative'),
				'creator' : self.getFieldData(metadata, 'creator'),
				'subject' : self.getFieldData(metadata, 'subject'),
				'description' : self.getFieldData(metadata, 'description'),
				'abstract' : self.getFieldData(metadata, 'abstract'),
				'publisher' : self.getFieldData(metadata, 'publisher'),
				'contributor' : self.getFieldData(metadata, 'contributor'),
				'date' : self.getFieldData(metadata, 'date'),
				'date2' : header.datestamp(),
				'type' : self.getFieldData(metadata, 'type'),
				'extent' : extent,
				'medium' : self.getFieldData(metadata, 'medium'),
				'source' : self.getFieldData(metadata, 'source'),
				'language' : self.getFieldData(metadata, 'language'),
				'references' : self.getFieldData(metadata, 'references'),
				'spatial' : self.getFieldData(metadata, 'spatial'),
				'attributionName' : self.getFieldData(metadata, 'attributionName'),
				'attributionURL' : self.getFieldData(metadata, 'attributionURL'),
				'license' : self.getFieldData(metadata, 'license'),
				'durationSecs' : self.getExtentInSeconds(extent)
			}
			self.es_local.index(index=self.ES_INDEX, doc_type=self.ES_DOC_TYPE, id=header.identifier(), body=item)

		print 'Done'
		return True

	def getGTAATermsBySubjects(self, subject, spatial):
		"""Get the GTAA terms related to the subject"""
		gtaaTerms = self.getGTAATermsBasedOnSubjectAndLocation(subject, spatial)

		#If there is no identifier, try to fetch the taakID from iMMix ES
		#NOTE: identifier and source are not defined in this scope; this block
		#appears to have been copied from another function and never wired up
		if identifier == '' and source != '':
			print 'No taakID!'
			taakID = self.getTaakIDBasedOnSource(source)
			if taakID:
				print 'assigning taakID to the identifier'
				identifier = taakID
		return gtaaTerms

	def getFieldData(self, metadata, fn):
		#return '; '.join(metadata.getField(fn))
		return metadata.getField(fn)

	def getExtentInSeconds(self, ext):
		#Parse an ISO 8601 duration such as 'PT1H2M3S' into seconds
		secs = 0
		if ext and ext.find('PT') != -1:
			ext = ext[2:]
			if ext.find('H') != -1:
				secs += int(ext[0:ext.find('H')]) * 3600
				ext = ext[ext.find('H') + 1:]
			if ext.find('M') != -1:
				secs += int(ext[0:ext.find('M')]) * 60
				ext = ext[ext.find('M') + 1:]
			if ext.find('S') != -1:
				secs += int(ext[0:ext.find('S')])
		return secs
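	# Sanity check (illustrative): with the minutes fix above,
	# getExtentInSeconds('PT1H2M3S') == 3600 + 120 + 3 == 3723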

	def secsToTimeString(self, secs):
		#Convert a number of seconds into an 'H:M:S' string
		h, rest = divmod(secs, 3600)
		m, s = divmod(rest, 60)
		return '%d:%d:%d' % (h, m, s)
	#Run the main function

	def getGTAATermsBasedOnSubjectAndLocation(self, subject, spatial):
		subs = None
		locs = None
		os_res = None
		gtaaExact = []
		gtaaFuzzy = []

		#First add GTAA terms based on the subject(s)
		if subject:
			subs = subject.split(';')
			for s in subs:
				os_res = self.openSKOSHandler.autoCompleteTable(s)
				if os_res:
					if len(os_res) == 1:
						gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
					elif len(os_res) > 1:
						for r in os_res:
							gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))

		#Append the GTAA terms based on the location(s)
		if spatial:
			locs = spatial.split(';')
			for l in locs:
				os_res = self.openSKOSHandler.autoCompleteTable(l, 'http://data.beeldengeluid.nl/gtaa/GeografischeNamen')
				if os_res:
					if len(os_res) == 1:
						gtaaExact.append('%s,%s' % (os_res[0]['label'], os_res[0]['value']))
					elif len(os_res) > 1:
						for r in os_res:
							gtaaFuzzy.append('%s,%s' % (r['label'], r['value']))

		return (gtaaExact, gtaaFuzzy)

	def getImmixMetadataBasedOnDrager(self, drager):
		query = {"query":{"bool":{"must":[{"query_string":{"default_field":"positie.dragernummer","query":"\"%s\"" % drager}}],"must_not":[],"should":[]}}}
		resp = self.es_local.search(index="search_expressie", doc_type="searchable_expressie", body=query, timeout="10s")
		if resp and resp['hits']['total'] == 1:
			for hit in resp['hits']['hits']:
				return hit
		elif resp and resp['hits']['total'] > 1:
			print 'more than one hit...'
			print resp
		return None

	def getTaakIDBasedOnSource(self, source):
		dragernrs = str(source).split('; ')
		drager = None

		"""Get the drager from the source (sometimes there are two, but most of the times they are the same)"""
		if len(dragernrs) == 2:
			if dragernrs[0] != dragernrs[1]:
				print dragernrs
				print '>>>>>>>>>> There are two dragers...'
			else:
				drager = dragernrs[0]
		else:
			drager = dragernrs[0]

		"""Try to find the taakID related to the drager"""
		if drager:
			md = self.getImmixMetadataBasedOnDrager(drager)
			if md:
				taakID = md['_source']['expressie']['niveau']['taakID']
				if taakID:
					print 'Found a taakID: %s\t%s' % (drager, taakID)
					return taakID
		return None
    def run(self):
        # Check that ElasticSearch is alive
        self.check_index()

        # If the user specified the --REBUILD flag, recreate the index
        if self.options['rebuild']:
            self.rebuild_index()

        # Connect to the repository
        registry = MetadataRegistry()
        registry.registerReader(self.settings["metadata_format"],
                                self.settings["metadata_reader"])

        client = Client(self.settings["uri"], registry)
        identity = client.identify()

        print "Connected to repository: %s" % identity.repositoryName()

        # We have to update granularity or we fail with:
        # oaipmh.error.BadArgumentError: Max granularity is YYYY-MM-DD:2003-04-10T00:00:00Z
        client.updateGranularity()

        # Initialise some variables
        batcher = Batch.Batch()
        total_records = 0
        start = time.time()

        # Now do the synchronisation

        # If the user specified an identifier, then synchronise this record
        if (self.options['identifier'] is not None):
            total_records += self.synchronise_record(
                client, batcher, self.options['identifier'])
        else:
            # Else, synchronise using the date-range provided by the user, or failing that,
            # the date-range based on the last sync

            # Get the synchronisation config record
            synchronisation_config = self.get_synchronisation_config()

            if self.options["from_date"] is not None:
                # If the user specified a from-date argument, use it
                from_date = self.options[
                    "from_date"]  # already a date (not a datetime)
            elif synchronisation_config is not None and "to_date" in synchronisation_config:
                # Else read the last synchronised to_date from the config, and add on a day
                from_date = dateutil.parser.parse(
                    synchronisation_config["to_date"]).date() + timedelta(
                        days=1)
            else:
                # Else use the default_from_date in the config
                from_date = dateutil.parser.parse(
                    self.settings['default_from_date']).date()

            if self.options["to_date"] is not None:
                to_date = self.options[
                    "to_date"]  # already a date (not a datetime)
            else:
                to_date = (date.today() - timedelta(days=1))

            # Force the from_date to use time 00:00:00
            from_date = datetime.combine(
                from_date, _time(hour=0, minute=0, second=0, microsecond=0))

            # Force the to_date to use time 23:59:59
            to_date = datetime.combine(
                to_date, _time(hour=23, minute=59, second=59, microsecond=0))

            print "Synchronising from %s - %s" % (from_date, to_date)

            while from_date < to_date:
                next_date = datetime.combine(
                    from_date.date() +
                    timedelta(days=(self.settings['delta_days'] - 1)),
                    _time(hour=23, minute=59, second=59, microsecond=0))
                number_of_records = self.synchronise_period(
                    client, batcher, from_date, next_date)
                batcher.clear()  #Store the records in elasticsearch
                self.put_synchronisation_config(from_date, next_date,
                                                number_of_records)
                from_date += timedelta(days=(self.settings['delta_days']))
                total_records += number_of_records

                # Pause so as not to get banned.
                pause = 20
                print "Sleeping for %i seconds so as not to get banned." % pause
                time.sleep(pause)

        # Store the records in the index
        batcher.clear()

        # Print out some statistics
        time_spent = time.time() - start
        print 'Total time spent: %d seconds' % (time_spent)

        if time_spent > 0.001:  # careful, as it's not an integer
            print 'Total records synchronised: %i records (%d records/second)' % (
                total_records, (total_records / time_spent))
        else:
            print 'Total records synchronised: %i records' % (total_records)
        return total_records
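The date windowing above walks from_date to to_date in delta_days-sized inclusive windows; extracted on its own (with illustrative values standing in for the settings), the arithmetic behaves like this:

from datetime import date, datetime, time as _time, timedelta

delta_days = 7  # illustrative; read from self.settings in the real code
from_date = datetime.combine(date(2014, 1, 1), _time(0, 0, 0))
to_date = datetime.combine(date(2014, 1, 20), _time(23, 59, 59))

while from_date < to_date:
    next_date = datetime.combine(
        from_date.date() + timedelta(days=delta_days - 1),
        _time(23, 59, 59))
    print("window: %s -> %s" % (from_date, next_date))
    from_date += timedelta(days=delta_days)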