def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    header = True
    with open(input_path, "r") as in_f:
        for line in tqdm(in_f):
            if header:  # skip the CSV header row
                header = False
                continue
            sentence, tokens = pd.read_csv(
                StringIO(line), header=None, usecols=[0, 1]
            ).values[0]
            # ast.literal_eval is safer than eval() for parsing the
            # serialized token list (needs "import ast")
            tokens = ast.literal_eval(tokens)
            eg = line_to_dict(sentence, tokens)
            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")
def convert(lang: str, input_path: Path, training_path: Path, validation_path: Path):
    nlp = spacy.blank(lang)
    db_train = DocBin()
    db_test = DocBin()
    df = pd.read_csv(input_path)
    # 80/20 split, stratified by class: sample humorous and
    # non-humorous rows separately with the same seed
    df_si = df[df.is_humor > 0]
    train_si = df_si.sample(frac=0.8, random_state=31416)
    test_si = df_si.drop(train_si.index)
    df_no = df[df.is_humor == 0]
    train_no = df_no.sample(frac=0.8, random_state=31416)
    test_no = df_no.drop(train_no.index)
    db_train = genera(nlp, train_si.text, {'humor': 1.0, 'no_humor': 0.0}, db_train)
    db_train = genera(nlp, train_no.text, {'humor': 0.0, 'no_humor': 1.0}, db_train)
    db_train.to_disk(training_path)
    db_test = genera(nlp, test_si.text, {'humor': 1.0, 'no_humor': 0.0}, db_test)
    db_test = genera(nlp, test_no.text, {'humor': 0.0, 'no_humor': 1.0}, db_test)
    db_test.to_disk(validation_path)
def convert(json_path, output):
    # relies on a module-level `nlp` pipeline being defined elsewhere
    db = DocBin()
    for line in srsly.read_jsonl(json_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output)
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
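# A minimal usage sketch for the converter above (illustrative, not from the
# original source): the JSONL layout ({"text": ..., "cats": {...}}) and the
# file names are assumptions. Loading the DocBin back verifies the round trip.
import spacy
from pathlib import Path
from spacy.tokens import DocBin

convert("en", Path("train.jsonl"), Path("train.spacy"))

nlp = spacy.blank("en")
db = DocBin().from_disk("train.spacy")
for doc in db.get_docs(nlp.vocab):
    print(doc.text, doc.cats)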
def convert(output_path):
    global nlp  # uses the module-level pipeline; note the input path is hard-coded
    db = DocBin()
    for line in srsly.read_jsonl("db.json"):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
def generate_corpus(nlp):
    directory_path = path.join('data')
    # `file_name` is expected to be defined at module level
    corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
    raw_path = Path(path.join(directory_path, file_name) + ".jsonl")
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)

    vulnerabilities = []
    with open(raw_path) as file:
        for line in file.readlines():
            vulnerability = loads(line)
            vulnerabilities.append({
                'description': vulnerability['data'],
                'entities': vulnerability.get('label', []),
            })

    corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])
    for vulnerability in vulnerabilities:
        document = nlp.make_doc(vulnerability['description'].lower())
        # doccano annotates labels at the character level, but nlp.make_doc
        # produces tokens, so convert the character offsets into
        # token-aligned spans before attaching them.
        tags = offsets_to_biluo_tags(document, vulnerability['entities'])
        entities = biluo_tags_to_spans(document, tags)
        document.set_ents(entities)
        for ent in document.ents:
            print(ent.label_)
            print(ent.text)
            print('\n')
        print('\nOK\n')
        corpus.add(document)

    print(len(corpus))
    print(list(corpus.get_docs(nlp.vocab)))
    corpus.to_disk(corpus_path)
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)
def make_docbin(user_key, language=None):
    # note: the language parameter is currently unused; the serial and the
    # file key below are built with hard-coded placeholder language codes
    docbin = DocBin(store_user_data=True)
    serial = make_serial(user_key, language='??')
    file_key = file_key_from_principal_key(principal_key=user_key, serial=serial,
                                           language='ll', principal_type='u')
    path = path_from_file_key(file_key)
    docbin.to_disk(path)
    return file_key, docbin
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    in_db = DocBin().from_disk(input_path)
    # store_user_data=True so the copied user_data actually survives serialization
    out_db = DocBin(store_user_data=True)
    logging.info(f"Read {len(in_db)} documents from {input_path}.")
    for doc in in_db.get_docs(nlp.vocab):
        new_doc = nlp.make_doc(doc.text)
        new_doc.user_data = doc.user_data
        new_doc.ents = doc.ents
        out_db.add(new_doc)
    out_db.to_disk(output_path)
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    data_tuples = ((eg["text"], eg) for eg in srsly.read_jsonl(input_path))
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        # doc.cats = {category: 0 for category in CATEGORIES}
        doc.cats[eg["label"]] = 1
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
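# Sketch (an assumption, not the original author's code): spaCy's textcat
# training generally expects every label to be present in doc.cats, so the
# commented-out initialization above can be restored with a concrete label
# set. The CATEGORIES list here is hypothetical.
CATEGORIES = ["SPORT", "POLITICS", "TECH"]

for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
    doc.cats = {category: 0 for category in CATEGORIES}  # explicit negatives
    doc.cats[eg["label"]] = 1
    doc_bin.add(doc)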
def prepare_data(
    params: Params,
    verbose: bool = True,
) -> Dict[str, Doc]:
    """
    Return a single spaCy Doc for each age.

    Warning: if the corpus binary is not on disk already, it will be saved to
    disk. This means the corpus should never be modified; otherwise, the binary
    will also contain the unexpected modifications.
    """
    # try loading transcripts from disk
    fn = params.corpus_name + '.spacy'
    bin_path = configs.Dirs.corpora / fn
    if bin_path.exists():
        doc_bin = DocBin().from_disk(bin_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
    # load raw transcripts + process them
    else:
        print(f'WARNING: Did not find binary file associated with '
              f'{params.corpus_name}. Preprocessing corpus...')
        transcripts = load_transcripts(params)
        docs: List[Doc] = [doc for doc in nlp.pipe(transcripts)]
        # WARNING: only save to disk if we know that the corpus has not been modified
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(bin_path)

    # group docs by age
    ages = load_ages(params)
    if len(ages) != len(docs):
        raise RuntimeError(f'Num docs={len(docs)} does not match num ages={len(ages)}')
    age2docs = {}
    for age in SortedSet(ages):
        if age == EXCLUDED_AGE:
            continue
        docs_at_age = [docs[n] for n, ai in enumerate(ages) if ai == age]
        age2docs[age] = docs_at_age
        if verbose:
            print(f'Processed {len(age2docs[age]):>6} transcripts for age={age}')

    # combine all documents at the same age
    age2doc = {}
    for age, docs in age2docs.items():
        doc_combined = Doc.from_docs(docs)
        age2doc[age] = doc_combined
        print(f'Num tokens at age={age} is {len(doc_combined):,}')
    return age2doc
def write_spacy_docs(
    data: Doc | Iterable[Doc],
    filepath: types.PathLike,
    *,
    make_dirs: bool = False,
    format: str = "binary",
    attrs: Optional[Iterable[str]] = None,
    store_user_data: bool = False,
) -> None:
    """
    Write one or more ``Doc`` s to disk at ``filepath`` in binary or pickle format.

    Args:
        data: A single ``Doc`` or a sequence of ``Doc`` s to write to disk.
        filepath: Path to file on disk to which data will be written.
        make_dirs: If True, automatically create (sub)directories if not already
            present in order to write ``filepath``.
        format ({"pickle", "binary"}): Format of the data written to disk.
            If "binary", uses :class:`spacy.tokens.DocBin` to serialize data;
            if "pickle", uses python's stdlib ``pickle``.

            .. warning:: When writing docs in pickle format, all the docs in
               ``data`` must be saved as a list, which means they're all loaded
               into memory. Mind your RAM usage, especially when writing many docs!

        attrs: List of attributes to serialize if ``format`` is "binary". If None,
            spaCy's default values are used; see here: https://spacy.io/api/docbin#init
        store_user_data: If True, write :attr:`Doc.user_data` and the values of
            custom extension attributes to disk; otherwise, don't.

    Raises:
        ValueError: if format is not "binary" or "pickle"
    """
    if isinstance(data, Doc):
        data = [data]
    if format == "binary":
        kwargs = {"docs": data, "store_user_data": store_user_data}
        if attrs is not None:
            kwargs["attrs"] = list(attrs)
        docbin = DocBin(**kwargs)
        docbin.to_disk(filepath)
    elif format == "pickle":
        if store_user_data is False:
            data = _clear_docs_user_data(data)
        with io_utils.open_sesame(filepath, mode="wb", make_dirs=make_dirs) as f:
            pickle.dump(list(data), f, protocol=-1)
    else:
        raise ValueError(
            errors.value_invalid_msg("format", format, {"binary", "pickle"}))
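# A quick usage sketch for write_spacy_docs above; the document text and the
# output paths are illustrative assumptions.
import spacy

nlp = spacy.blank("en")
doc = nlp("DocBin round-trips annotations efficiently.")

# binary format via DocBin, keeping user data
write_spacy_docs(doc, "docs.spacy", store_user_data=True)

# pickle format instead; note all docs are materialized in memory first
write_spacy_docs([doc], "docs.pkl", format="pickle")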
def convert(input_path, output_path, lang='en'):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = (
                    f"Skipping entity [{start}, {end}, {label}] in the following "
                    f"text because the character span '{doc.text[start:end]}' does "
                    f"not align with token boundaries:\n\n{repr(text)}\n"
                )
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
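# Variant sketch (an assumption, not from the original source): instead of
# skipping misaligned annotations, Doc.char_span can snap them to token
# boundaries via alignment_mode, as later examples in this collection do.
# "contract" shrinks the span to tokens fully inside the character offsets;
# "expand" grows it to cover partially overlapped tokens.
import spacy

nlp = spacy.blank("en")
doc = nlp.make_doc("Bought an iPhone11 yesterday")
# these offsets cover only part of the token "iPhone11", so the strict lookup fails
print(doc.char_span(10, 16, label="GADGET"))                           # None
print(doc.char_span(10, 16, label="GADGET", alignment_mode="expand"))  # iPhone11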
def removefrom_docbin(file_key, obj_type, obj_id):
    file_key, docbin = get_docbin(file_key=file_key)
    language = language_from_file_key(file_key)
    model = settings.LANGUAGE_MODELS[language]
    index = i = 0
    docs = []
    # keep every doc except the one matching (obj_type, obj_id),
    # remembering the removed doc's position
    for doc in list(docbin.get_docs(model.vocab)):
        if doc._.obj_type == obj_type and doc._.obj_id == obj_id:
            index = i
        else:
            docs.append(doc)
        i += 1
    delete_docbin(file_key)
    docbin = DocBin(docs=docs, store_user_data=True)
    path = path_from_file_key(file_key)
    docbin.to_disk(path)
    return index
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in srsly.read_json(input_path):
        doc = nlp(eg[0])
        doc.ents = [
            doc.char_span(s[0], s[1], label=s[2])
            for s in eg[1].get("entities", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def to_spacy(self, df, file_path=None):
    """
    Convert the dataframe returned by the annotator into a .spacy file.

    Parameters
    ----------
    df (pandas DataFrame): Dataframe returned by the annotator (see annotate()).
    file_path (str): Filepath (including filename) to save the .spacy file to.

    Returns
    -------
    The spaCy DocBin, in case a user wants to combine additional training data.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Pass the pandas dataframe returned by annotate()")
    if file_path and not isinstance(file_path, str):
        raise TypeError("The file_path must be a string or None")
    if file_path is None:
        file_path = os.path.join(os.getcwd(), 'annotations.spacy')
    db = DocBin()
    training_data = [ant for ant in df['annotations'].tolist() if ant]
    for text, annotations in training_data:
        ents = []
        doc = self.nlp(text)
        for start, end, label in annotations['entities']:
            span = doc.char_span(start, end, label=label)
            if span is not None:  # skip spans that don't align with token boundaries
                ents.append(span)
        # Drop overlapping spans. Note: when spans overlap, the (first) longest
        # span is preferred over shorter spans.
        # See: https://spacy.io/api/top-level#util.filter_spans
        # TODO: alert users that some spans have been dropped.
        doc.ents = filter_spans(ents)
        db.add(doc)
    db.to_disk(file_path)
    print(f"Spacy file saved to: {file_path}")
    return db
def descrip_to_spacy(data: pd.DataFrame, output_path: str) -> None:
    """Take a dataframe with description and fraud label columns and save a DocBin to disk."""
    tuples = data.apply(
        lambda row: (strip_html_tags(row["description"]), row["fraud"]), axis=1
    ).to_list()
    nlp = spacy.blank("en")
    db = DocBin()
    for doc, label in nlp.pipe(tuples, as_tuples=True):
        if label:
            doc.cats["FRAUD"] = 1
            doc.cats["NOTFRAUD"] = 0
        else:
            doc.cats["FRAUD"] = 0
            doc.cats["NOTFRAUD"] = 1
        db.add(doc)
    db.to_disk(output_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i', '--input',
        required=True,
        help='The path to the input dataset to convert to SpaCy\'s binary format.')
    parser.add_argument(
        '-o', '--output',
        required=True,
        help='The path to which the converted dataset is written in SpaCy\'s binary format.')
    parser.add_argument(
        '-c', '--categories',
        required=True,
        help='The path to the .json file which contains the categories.')
    args = parser.parse_args()

    # Read the .json file which contains the list of categories
    categories_dict = _read_categories(path_to_file=args.categories)
    # Define an empty SpaCy pipeline for the English language
    nlp = spacy.blank('en')
    # Read and parse the sentences with their labels
    records = read_tsv_file(args.input)
    # Convert the (sentence, label) pairs to SpaCy Doc objects
    docs = [
        convert_record(nlp, record_dict, categories_dict)
        for record_dict in records
    ]
    # Create the SpaCy data structure that contains the SpaCy Doc(s)
    doc_bin = DocBin(docs=docs)
    # Save it in the .spacy file format
    doc_bin.to_disk(args.output)
    print('INFO: saved {} as .spacy binary format [{} documents].'.format(
        args.input.split('/')[1].split('.')[0], len(docs)))
def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for idx, eg in enumerate(srsly.read_jsonl(input_path)):
        if idx % 10000 == 0:
            print(f"converted {idx} sentences")
        doc = nlp(eg["text"])
        spans_from_json = eg.get("spans", [])
        spans_objects = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in spans_from_json
        ]
        spans_objects = filter_spans(spans_objects)
        doc.ents = spans_objects
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def main(input_path: Path = typer.Argument(..., exists=True, dir_okay=False)):
    print("Read params.yaml...")
    with open("params.yaml", "r") as fd:
        params = yaml.safe_load(fd)
    dev_size = params["train"]["corpora"]["dev_size"]
    shuffle_seed = params["train"]["corpora"]["shuffle_seed"]
    print(f"...read dev_size={dev_size}, shuffle_seed={shuffle_seed}")

    print("Read annotations...")
    corpus = list(srsly.read_jsonl(input_path))
    print(f"...read {len(corpus)} texts")

    print("Convert into documents...")
    docs = []
    nlp = spacy.blank("en")
    for eg in corpus:
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        docs.append(doc)
    print(f"...converted {len(docs)} documents")

    print("Split into train and dev...")
    train, dev = train_test_split(docs,
                                  test_size=dev_size,
                                  random_state=shuffle_seed,
                                  shuffle=True)
    print(f"...split into {len(train)} train and {len(dev)} dev documents")

    print("Write serialized documents...")
    for split, data in [("train", train), ("dev", dev)]:
        output_path = input_path.with_suffix(f".{split}.spacy")
        doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], docs=data)
        doc_bin.to_disk(output_path)
        print(f"...wrote {output_path}")
def generateSpacyFiles(training_data):
    filenames = ['./train.spacy', './evaluation.spacy']
    for i in range(len(filenames)):
        nlp = spacy.blank('de')  # load a new spacy model
        db = DocBin()  # create a DocBin object
        for text, annot in tqdm(training_data[i]):  # data in previous format
            doc = nlp.make_doc(text)  # create doc object from text
            ents = []
            for start, end, label in annot["entities"]:  # add character indexes
                span = doc.char_span(start, end, label=label,
                                     alignment_mode="contract")
                if span is None:
                    print("Skipping entity")
                else:
                    ents.append(span)
            doc.ents = ents  # label the text with the ents
            db.add(doc)
        db.to_disk(filenames[i])  # save the docbin object
def df_to_spacy(df, outfile, model='en_core_web_md'):
    """Convert a dataframe into a .spacy training file."""
    nlp = spacy.load(model)
    # nlp = spacy.blank("en")
    db = DocBin()  # create a DocBin object
    for index, row in df.iterrows():
        doc = nlp.make_doc(row['data'])  # create doc object from text
        ents = []
        for start, end, label in row['label']:  # add character indexes
            span = doc.char_span(start, end, label=label,
                                 alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents  # label the text with the ents
        db.add(doc)
    db.to_disk(outfile)  # save the docbin object
    print(f'Successfully wrote \'{outfile}\' to disk')
def build_training_file(self):
    # TODO: divide into training/dev
    nlp = spacy.blank('en')
    doc_bin = DocBin()
    annotated_text = self._db.get_training_corpus()
    for text, entities in annotated_text:
        doc = nlp.make_doc(text)
        ents = list()
        for start, stop, label in entities:
            span = doc.char_span(start, stop, label=label,
                                 alignment_mode="contract")
            if span is None:
                log.debug(f'{label} entity from {start} to {stop} was not ' +
                          'valid and was discarded.')
            else:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    # TODO: remove underscore when dev data is available
    data_uri = f'{self._OUT_DIR}/_train.spacy'
    log.info(f'Now saving training file to disk at {data_uri}')
    doc_bin.to_disk(data_uri)
def main(json_loc: Path, train_file: Path, dev_file: Path, test_file: Path,
         test_split=0.189, train_split=0.709):
    """Create the corpus from the Prodigy annotations."""
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}
    long_rel_count = 0   # how many relations were cut for spanning too many tokens
    error_count_rel = 0  # how often a label is something other than ARG0, ARG1, ARG

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = len([
            True for line in jsonfile if json.loads(line)["answer"] == "accept"
        ])
        msg.info(f"Number of accepted recipes: {length_training_data}")

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  # one recipe
            span_starts = set()
            if example["answer"] != "accept":
                continue
            neg = 0
            pos = 0
            try:
                # Parse the tokens; example["tokens"] is a list of dicts
                words = [t["text"] for t in example["tokens"]]
                # "ws" says whether whitespace follows the word (True/False)
                spaces = [t["ws"] for t in example["tokens"]]
                doc = Doc(vocab, words=words, spaces=spaces)

                # Parse the entities; example["spans"] is a list of dicts
                spans = example["spans"]
                entities = []
                span_end_to_start = {}
                ents_dict = {}
                for span in spans:
                    # "start"/"end" are character offsets of the span within the doc
                    entity = doc.char_span(span["start"], span["end"],
                                           label=span["label"])
                    # map the span's end token index to its start token index
                    span_end_to_start[span["token_end"]] = span["token_start"]
                    entities.append(entity)
                    span_starts.add(span["token_start"])
                    ents_dict[span["token_start"]] = (span["label"],
                                                      span["token_start"])
                doc.ents = entities

                def is_candidate_pair(x1, x2):
                    """Apply the VERBS_TO_OTHER and DIFF_FRONT_BACK filters
                    to a pair of span start tokens."""
                    if VERBS_TO_OTHER:
                        # only pairs from a verb to another entity type
                        if ents_dict[x1][0] != "V":
                            return False
                        if ents_dict[x2][0] not in ["Z", "TOOL", "ATTR", "TEMP",
                                                    "DAUER", "ZEITP", "PRÄP"]:
                            return False
                    if DIFF_FRONT_BACK:
                        # keep pairs within BACK tokens behind or FRONT tokens ahead
                        return 0 <= (x1 - x2) <= BACK or -FRONT <= (x1 - x2) < 0
                    # otherwise filter on absolute token distance
                    return abs(ents_dict[x1][1] - ents_dict[x2][1]) <= TOKEN_LENGTH

                # Create the candidate token combinations: every admissible pair
                # of span start tokens becomes a key for an individual label dict,
                # e.g. (1, 1), (1, 2), ...
                rels = {}
                for x1 in span_starts:
                    for x2 in span_starts:
                        if is_candidate_pair(x1, x2):
                            rels[(x1, x2)] = {}

                # Parse the relations; example["relations"] is a list of dicts.
                # The 'head' and 'child' annotations refer to the end token of
                # the span, but we want the first token.
                for relation in example["relations"]:
                    start = span_end_to_start[relation["head"]]
                    end = span_end_to_start[relation["child"]]
                    label = relation["label"]
                    if DETAILED_ARGS:
                        # assign a new label based on the child span's type
                        if label == "ARG0":
                            if ents_dict[end][0] not in ["Z", "TOOL"]:
                                label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                label = MAP_LABELS_ARG0[ents_dict[end][0]]
                        elif label == "ARG1":
                            if ents_dict[end][0] not in ["Z", "TOOL"]:
                                label = MAP_LABELS_ARG[ents_dict[end][0]]
                            else:
                                label = MAP_LABELS_ARG1[ents_dict[end][0]]
                        elif label == "ARG":
                            if ents_dict[end][0] == "Z":
                                label = "Arg0Z"
                            elif ents_dict[end][0] == "TOOL":
                                label = "Arg1Tool"
                            else:
                                label = MAP_LABELS_ARG[ents_dict[end][0]]
                        else:
                            error_count_rel += 1
                    else:
                        # MAP_LABELS_STANDARD is a dict keyed by the raw label
                        label = MAP_LABELS_STANDARD[label]

                    # Positive relations are added
                    try:
                        if label not in rels[(start, end)]:
                            rels[(start, end)][label] = 1.0
                            pos += 1
                    except KeyError:
                        # the relation exists in the annotation but is not a
                        # valid token combination (too far apart / not starting
                        # from a verb)
                        long_rel_count += 1

                # The annotation is complete, so fill in zeros where data is missing
                if DETAILED_ARGS:
                    all_labels = (list(MAP_LABELS_ARG0.values())
                                  + list(MAP_LABELS_ARG1.values())
                                  + list(MAP_LABELS_ARG.values()))
                else:
                    all_labels = list(MAP_LABELS_STANDARD.values())
                for x1, x2 in rels:
                    for label in all_labels:
                        if label not in rels[(x1, x2)]:
                            neg += 1
                            rels[(x1, x2)][label] = 0.0

                # e.g. {(1, 1): {"Arg0": 1.0, "Arg1": 0.0, ...}, (1, 2): {...}}
                doc._.rel = rels

                # Only keep documents with at least one positive case (if the doc
                # isn't annotated, "relations" is an empty list).
                if pos > 0:
                    recipe_id = example["_input_hash"]
                    if len(docs["train"]) < round(train_split * length_training_data):
                        ids["train"].add(recipe_id)
                        docs["train"].append(doc)
                        count_pos["train"] += pos
                        count_all["train"] += pos + neg
                    elif len(docs["test"]) < round(test_split * length_training_data):
                        ids["test"].add(recipe_id)
                        docs["test"].append(doc)
                        count_pos["test"] += pos
                        count_all["test"] += pos + neg
                    else:
                        ids["dev"].add(recipe_id)
                        docs["dev"].append(doc)
                        count_pos["dev"] += pos
                        count_all["dev"] += pos + neg
            except KeyError as e:
                msg.fail(f"Skipping doc because of key error: {e} "
                         f"in {example['_input_hash']}")

    msg.info(f"{long_rel_count} relations have been cut because tokens are too far apart.")

    docbin = DocBin(docs=docs["train"], store_user_data=True)
    docbin.to_disk(train_file)
    msg.info(f"{len(docs['train'])} training recipes from {len(ids['train'])} unique recipes, "
             f"{count_pos['train']}/{count_all['train']} pos instances.")

    docbin = DocBin(docs=docs["dev"], store_user_data=True)
    docbin.to_disk(dev_file)
    msg.info(f"{len(docs['dev'])} dev recipes from {len(ids['dev'])} unique recipes, "
             f"{count_pos['dev']}/{count_all['dev']} pos instances.")

    docbin = DocBin(docs=docs["test"], store_user_data=True)
    docbin.to_disk(test_file)
    msg.info(f"{len(docs['test'])} test recipes from {len(ids['test'])} unique recipes, "
             f"{count_pos['test']}/{count_all['test']} pos instances.")
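# Reading the relation annotations back (a sketch, assuming the train_file path
# from above; spacy.blank("de") is a guess based on the German label names).
# The custom extension must be registered before the docs are deserialized, and
# the user data survives because the DocBins were saved with store_user_data=True.
import spacy
from spacy.tokens import Doc, DocBin

if not Doc.has_extension("rel"):
    Doc.set_extension("rel", default={})

nlp = spacy.blank("de")
for doc in DocBin().from_disk(train_file).get_docs(nlp.vocab):
    print(doc._.rel)  # {(head_start, child_start): {label: 1.0 or 0.0, ...}, ...}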
def make_docs(data):  # (header assumed; this excerpt is called as make_docs(...) below)
    docs = []
    # nlp.pipe([texts]) is way faster than running nlp(text) for each text.
    # as_tuples allows us to pass in tuples: the first element is treated as
    # the text, the second one is returned as-is.
    # for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
    for doc, label in nlp.pipe(data, as_tuples=True):
        # we need to set the (text)cat(egory) for each document
        doc.cats["positive"] = label
        # put them into a nice list
        docs.append(doc)
    return docs

# we are so far only interested in the first 5000 reviews;
# this keeps the training time short.
# In practice, take as much data as you can get;
# you can always reduce it to make the script even faster.
num_texts = 5000

# first we need to transform all the training data
train_docs = make_docs(train_data[:num_texts])
# then we save it in a binary file to disk
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/train.spacy")

# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("./data/valid.spacy")
        'FALAR SOBRE SEMEAR': 0.0,
        'FALAR SOBRE ADA': 0.0,
        'MÚSICA': 0.0,
        'SOLETRAR': 0.0,
        'DANÇAR': 1.0
    })
    doc = nlp.make_doc(texto)
    doc.cats = dic.copy()
    # db.add(doc)
    arq.write("{\"text\":\"" + texto + "\",\"cats\":")
    arq.write(json.dumps(dic, ensure_ascii=False))
    arq.write("}\n")
    # print(doc.cats)
    # baseDeDadosFinal.append([texto, dic.copy()])

arq.close()
convert("db.json", "db.spacy")
db.to_disk("db")

# After running PLN.py, run the commands below:
# python -m spacy train config.conf --output training/ --paths.train db.spacy --paths.dev db.spacy --nlp.lang "pt" --gpu-id -1
# python -m spacy debug data config.conf --paths.train db --paths.dev db --nlp.lang "pt"
# python -m spacy init fill-config config.conf

# Reference site:
# https://towardsdatascience.com/sarcasm-text-classification-using-spacy-in-python-7cd39074f32e
# create training set
nlp = spacy.blank("en")  # load a new spacy model
db = DocBin()  # create a DocBin object
for text, annot in tqdm(trainSet):  # data in previous format
    doc = nlp.make_doc(text)  # create doc object from text
    ents = []
    for start, end, label in annot["entities"]:  # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)
db.to_disk("./train.spacy")  # save the docbin object

# create validation set
nlp = spacy.blank("en")  # load a new spacy model
db = DocBin()  # create a DocBin object
for text, annot in tqdm(valSet):  # data in previous format
    doc = nlp.make_doc(text)  # create doc object from text
    ents = []
    for start, end, label in annot["entities"]:  # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)
train = pd.read_csv("./data/sentiment/norec_sentence/train.txt",
                    delimiter="\t", header=None)  # type: ignore
dev = pd.read_csv("./data/sentiment/norec_sentence/dev.txt",
                  delimiter="\t", header=None)  # type: ignore
test = pd.read_csv("./data/sentiment/norec_sentence/test.txt",
                   delimiter="\t", header=None)  # type: ignore

for sid, (label, sent) in train.iterrows():
    doc = nlp(sent)
    doc.user_data["gold"] = label
    train_doc_bin.add(doc)
train_doc_bin.to_disk("./data/sentiment/norec_sentence/train.docbin")

for sid, (label, sent) in dev.iterrows():
    doc = nlp(sent)
    doc.user_data["gold"] = label
    dev_doc_bin.add(doc)
dev_doc_bin.to_disk("./data/sentiment/norec_sentence/dev.docbin")

for sid, (label, sent) in test.iterrows():
    doc = nlp(sent)
    doc.user_data["gold"] = label
    test_doc_bin.add(doc)
test_doc_bin.to_disk("./data/sentiment/norec_sentence/test.docbin")

##################################################################
# Weak supervision
import json

import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("exercises/en/iphone.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Add patterns to the matcher
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]
matcher.add("GADGET", [pattern1, pattern2])

docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matches
    ]
    doc.ents = spans
    docs.append(doc)

doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")
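# Caveat sketch (an assumption, not part of the original exercise): if the two
# patterns ever produce overlapping matches, assigning them to doc.ents raises
# an error. spaCy's filter_spans utility keeps the longest span per overlap:
from spacy.util import filter_spans

docs = []
for doc in nlp.pipe(TEXTS):
    spans = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matcher(doc)
    ]
    doc.ents = filter_spans(spans)  # drop overlapping matches
    docs.append(doc)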