def dedup():
    """Run a Deduplicator over 'cn/images.csv' and print its count.

    The Deduplicator is seeded with the ids of all items of 'images.csv'
    whose source_url contains the substring 'edmond'.
    """
    edmond_ids = []
    for item in csv_items('images.csv'):
        if 'edmond' in item['source_url']:
            edmond_ids.append(item['id'])

    deduplicator = Deduplicator(edmond_ids)
    visit('cn/images.csv', deduplicator)
    print(deduplicator.count)
from __future__ import unicode_literals, print_function

from tsammalexdata.util import visit


class Visitor(object):
    """Row visitor that inserts one extra column at position 2.

    The header row (index 0) receives the column name 'synonyms'; every
    other row receives an empty string in that position.
    """

    def __call__(self, index, row):
        """Return a new row with the extra cell spliced in after the first two."""
        new_cell = 'synonyms' if index == 0 else ''
        leading, trailing = row[:2], row[2:]
        return leading + [new_cell] + trailing


if __name__ == '__main__':
    visit('taxa.csv', Visitor())
def select(p):
    """Stage the file ``p`` and run a Selector over the staged copy.

    Copies ``data_file('cn', p)`` to ``data_file('cn', 'staged_images.csv')``,
    visits 'cn/staged_images.csv' with a ``Selector`` and finally prints the
    number of newline characters found in the staged file.

    :param p: name of the file to stage, passed through to ``data_file``.
    """
    source = data_file('cn', p)
    staged = data_file('cn', 'staged_images.csv')
    shutil.copy(source, staged)
    visit('cn/staged_images.csv', Selector())

    # Read through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(staged) as fp:
        # Same value as len(text.split('\n')) - 1.
        print(fp.read().count('\n'))
def rewrite(p):
    """Visit 'cn/' + p with a JSON2CSV visitor.

    The visitor is constructed from ``data_file('cn', 'images.json')``.

    :param p: file name appended to the 'cn/' prefix.
    """
    target = 'cn/' + p
    converter = JSON2CSV(data_file('cn', 'images.json'))
    visit(target, converter)
def do_check(fname):
    """Visit ``fname`` with a RemoveUploaded visitor.

    The visitor receives a dict of the items of 'images.csv' whose
    source_url contains 'edmond', keyed by the pair (taxa__id, tags).
    When several items share a key, the last one read wins.

    :param fname: name of the file handed to ``visit``.
    """
    uploaded = {}
    for item in csv_items('images.csv'):
        if 'edmond' not in item['source_url']:
            continue
        uploaded[(item['taxa__id'], item['tags'])] = item

    visit(fname, RemoveUploaded(uploaded))
def select(p):
    """Stage the file ``p`` and run a Selector over the staged copy.

    Copies ``data_file('cn', p)`` to ``data_file('cn', 'staged_images.csv')``,
    visits 'cn/staged_images.csv' with a ``Selector`` and finally prints the
    number of newline characters found in the staged file.

    :param p: name of the file to stage, passed through to ``data_file``.
    """
    source = data_file('cn', p)
    staged = data_file('cn', 'staged_images.csv')
    shutil.copy(source, staged)
    visit('cn/staged_images.csv', Selector())

    # Use a context manager so the handle is closed as soon as the count
    # has been taken, rather than leaking until garbage collection.
    with open(staged) as fp:
        # Same value as len(text.split('\n')) - 1.
        print(fp.read().count('\n'))
def dedup():
    """Run a Deduplicator over 'cn/images.csv' and print its count.

    The Deduplicator is seeded with the ids of all items of 'images.csv'
    whose source_url contains the substring 'edmond'.
    """
    known_ids = []
    for item in csv_items('images.csv'):
        if 'edmond' in item['source_url']:
            known_ids.append(item['id'])

    deduplicator = Deduplicator(known_ids)
    visit('cn/images.csv', deduplicator)
    print(deduplicator.count)
class Visitor(object):
    """Row visitor that fills the 'source' column from image metadata.

    The metadata is loaded as JSON from ``data_file('images_md.json')``.
    ``count`` holds the number of rows whose 'source' cell was rewritten.
    """

    def __init__(self):
        self.cols = {}
        self.count = 0
        with open(data_file('images_md.json'), 'rb') as fp:
            self.md = json.load(fp)

    def __call__(self, index, row):
        """Return ``row``, possibly with its 'source' cell replaced in place.

        The header row (index 0) only records the column positions. For any
        other row, the path segments of the URL in the 'src' column are
        scanned; the first segment that is a metadata key with a truthy
        'source_url' supplies the new 'source' value.
        """
        if index == 0:
            self.cols = dict((name, pos) for pos, name in enumerate(row))
            return row

        url = URL(row[self.cols['src']])
        try:
            for segment in url.path_segments():
                if segment not in self.md:
                    continue
                source_url = self.md[segment].get('source_url')
                if not source_url:
                    continue
                row[self.cols['source']] = source_url
                self.count += 1
                break
        except IndexError:
            # Best effort: an IndexError leaves the row as it is.
            pass
        return row


if __name__ == '__main__':
    v = Visitor()
    visit('images.csv', v)
    print(v.count)
self.edmond_urls = file_urls(data_file('Edmond.xml')) self.cols = {} self.count = 0 def __call__(self, index, row): if index == 0: self.cols = {col: i for i, col in enumerate(row)} return row _id = row[self.cols['id']] if _id in self.edmond_urls: row[self.cols['source_url']] = self.edmond_urls[_id]['full'] self.count += 1 else: # # FIXME: check whether source_url is an Edmond image URL, if not, upload the # image to Edmond, insert the URL here! Depends on the imeji API being # available on Edmond. # print(_id, row) return row if __name__ == '__main__': with open(data_file('Edmond.xml'), 'w', encoding='utf8') as fp: fp.write(requests.get(URL).text) v = Visitor() visit(sys.argv[1] if len(sys.argv) > 1 else 'images.csv', v) print(v.count)
Corresponding items in the Tsammalex collection on Edmond are detected by matching the id of the image against filename or checksum attribute of the Edmond item. """ def __init__(self): self.edmond_urls = file_urls(data_file('Edmond.xml')) self.cols = {} def __call__(self, index, row): if index == 0: self.cols = {col: i for i, col in enumerate(row)} return row _id = row[self.cols['id']] if _id in self.edmond_urls: row[self.cols['source_url']] = self.edmond_urls[_id]['full'] else: # # FIXME: check whether source_url is an Edmond image URL, if not, upload the # image to Edmond, insert the URL here! Depends on the imeji API being # available on Edmond. # print(_id, row) return row if __name__ == '__main__': with open(data_file('Edmond.xml'), 'w', encoding='utf8') as fp: fp.write(requests.get(URL).text) visit(sys.argv[1] if len(sys.argv) > 1 else 'images.csv', Visitor())