def sample_text_elements(workspace_id: str, dataset_name: str, sample_size: int,
                         filter_func, remove_duplicates=False,
                         random_state=None) -> Tuple[Sequence, int]:
    """Filter the in-memory corpus and optionally sample from the result.

    :param workspace_id: if None, no labels info would be used or output
    :param dataset_name:
    :param sample_size: if None, return all elements without sampling
    :param filter_func: callable applied to the corpus DataFrame
    :param remove_duplicates:
    :param random_state: base seed; when a workspace_id is given it is folded
        into the seed so different workspaces draw different samples
    :return: the resulting TextElements and the pre-sampling hit count
    """
    corpus_df = get_ds_in_memory(dataset_name, remove_duplicates).copy()
    seed = random_state if random_state else 0
    if workspace_id:
        # make the seed workspace-specific, and attach this workspace's labels
        seed = sum(ord(ch) for ch in workspace_id) + seed
        labels_by_uri = get_labels(workspace_id, dataset_name).copy()
        corpus_df['category_to_label'] = [dict(labels_by_uri[uri])
                                          for uri in corpus_df['uri']]
    corpus_df = filter_func(corpus_df)
    hit_count = len(corpus_df)
    if sample_size and hit_count > sample_size:
        corpus_df = corpus_df.sample(n=sample_size, random_state=seed)
    field_rows = corpus_df[TextElement.get_field_names()].itertuples(
        index=False, name=None)
    result_text_elements = [TextElement(*row) for row in field_rows]
    return result_text_elements, hit_count
def get_text_elements(dataset_name: str, uris: Iterable) -> Sequence[TextElement]:
    """Return the TextElements of *dataset_name* whose uri appears in *uris*."""
    wanted_uris = list(uris)
    corpus_df = get_ds_in_memory(dataset_name)
    matching_df = corpus_df.loc[corpus_df['uri'].isin(wanted_uris)]
    field_rows = matching_df[TextElement.get_field_names()].itertuples(
        index=False, name=None)
    return [TextElement(*row) for row in field_rows]
def get_all_text_elements(self, dataset_name: str) -> List[TextElement]:
    """Return a List of all TextElement in the given dataset_name.

    :param dataset_name: the name of the dataset from which the TextElements
        should be retrieved.
    """
    corpus_df = logic.get_ds_in_memory(dataset_name)
    field_rows = corpus_df[TextElement.get_field_names()].itertuples(
        index=False, name=None)
    return [TextElement(*row) for row in field_rows]
def generate_simple_doc(dataset_name, doc_id=0, add_duplicate=False):
    """Build a small fixed-content Document for tests.

    :param dataset_name: used as the prefix of the document and element uris
    :param doc_id: numeric suffix of the document uri
    :param add_duplicate: when True, the title sentence appears twice, so
        duplicate-removal behavior can be exercised
    """
    sentences = [
        'Document Title is Super Interesting',
        'First sentence is not that attractive.',
        'The second one is a bit better.',
        'Last sentence offers a promising view for the future!'
    ]
    if add_duplicate:
        sentences.append('Document Title is Super Interesting')

    elements = []
    offset = 0
    for idx, sentence in enumerate(sentences):
        span_end = offset + len(sentence)
        uri = URI_SEP.join([dataset_name, str(doc_id), str(idx)])
        elements.append(
            TextElement(uri=uri, text=sentence, span=[(offset, span_end)],
                        metadata={}, category_to_label={}))
        offset = span_end + 1  # +1 for the separating character between spans
    return Document(uri=dataset_name + URI_SEP + str(doc_id),
                    text_elements=elements, metadata={})
def _process(self):
    """Read the raw csv file and populate self.documents and self.uri_category_labels.

    Rows are grouped into documents by the (optional) doc-id column; within a
    document, element uris are numbered sequentially and text spans are laid
    out back-to-back with one separating character between them. Each element
    gets a positive Label for its own category and negative Labels for all
    other categories.
    """
    if not os.path.isfile(self.get_raw_data_path()):
        raise Exception(
            f'{self.dataset_part.name.lower()} set file for dataset "{self.dataset_name}" not found'
        )
    all_categories = self._get_all_categories()
    df = pd.read_csv(self.get_raw_data_path(), encoding=self.encoding)
    # start from (text, category) pairs...
    texts_categories_contexts_doc_ids = [
        (text, category) for text, category in list(
            zip(df[self.text_col], df[self.label_col]))
    ]
    # ...then append the context column (None when the column is absent)...
    texts_categories_contexts_doc_ids = \
        add_column_or_default_to_zip(texts_categories_contexts_doc_ids, df,
                                     self.context_col, None)
    # ...and the doc-id column (0 when the column is absent)
    texts_categories_contexts_doc_ids = \
        add_column_or_default_to_zip(texts_categories_contexts_doc_ids, df,
                                     self.doc_id_col, 0)
    uri_to_category_labels = []
    prev_doc_id = None
    element_id = -1  # incremented before first use, so numbering starts at 0
    text_span_start = 0
    doc_uri_to_text_elements = defaultdict(list)
    for idx, (text, category, context,
              doc_id) in enumerate(texts_categories_contexts_doc_ids):
        # a doc-id change starts a new document: restart element numbering and
        # span offsets (NOTE(review): assumes rows of the same document are
        # contiguous in the csv — non-contiguous repeats of a doc id would
        # merge into one Document with restarted numbering)
        if prev_doc_id is not None and prev_doc_id != doc_id:
            element_id = -1
            text_span_start = 0
        doc_uri = self.dataset_name + '_' + self.dataset_part.name.lower(
        ) + URI_SEP + str(doc_id)
        element_id += 1
        text_element_uri = doc_uri + URI_SEP + str(element_id)
        # falsy contexts (None, empty string) yield empty metadata
        metadata = {METADATA_CONTEXT_KEY: context} if context else {}
        text_element = TextElement(uri=text_element_uri,
                                   text=text,
                                   span=[(text_span_start,
                                          (text_span_start + len(text)))],
                                   metadata=metadata,
                                   category_to_label={})
        doc_uri_to_text_elements[doc_uri].append(text_element)
        # positive label for the row's own category, negative for all others
        category_to_label_dict = \
            {cat: Label(labels=self.LABEL_POSITIVE, metadata={}) if cat == category
             else Label(labels=self.LABEL_NEGATIVE, metadata={})
             for cat in all_categories}
        uri_to_category_labels.append(
            (text_element_uri, category_to_label_dict))
        prev_doc_id = doc_id
        text_span_start += (len(text) + 1)  # +1 for the separating character
    self.documents = [
        Document(uri=doc_uri, text_elements=text_elements, metadata={})
        for doc_uri, text_elements in doc_uri_to_text_elements.items()
    ]
    self.uri_category_labels = uri_to_category_labels
def generate_simple_doc(dataset_name, doc_id=0):
    """Build a tiny three-sentence Document for label-related tests.

    :param dataset_name: used as the prefix of the document and element uris
    :param doc_id: numeric suffix of the document uri
    """
    sentences = ['with label true', 'with label false', 'no label']
    elements = []
    offset = 0
    for idx, sentence in enumerate(sentences):
        span_end = offset + len(sentence)
        uri = URI_SEP.join([dataset_name, str(doc_id), str(idx)])
        elements.append(
            TextElement(uri=uri, text=sentence, span=[(offset, span_end)],
                        metadata={}, category_to_label={}))
        offset = span_end + 1  # +1 for the separating character between spans
    return Document(uri=dataset_name + URI_SEP + str(doc_id),
                    text_elements=elements, metadata={})
def _process(self):
    """Parse the raw label-prefixed text file into documents and labels.

    Each input line is '<coarse>:<fine> <text...>'. All elements are placed in
    a single Document; every element receives one positive Label (its own
    category, coarse- or fine-grained per self.use_fine_grained_labels) and
    negative Labels for every other category.
    """
    all_categories = self._get_all_categories()
    text_elements = []
    uri_to_category_labels = []
    with open(self.get_raw_data_path(), 'r', encoding='latin-1') as f:
        # each entry is [label_token, rest_of_line]
        label_and_text = [line.rstrip().split(' ', 1) for line in f.readlines()]
    for element_id, parts in enumerate(label_and_text):
        fine_grained = parts[0]
        coarse_grained = parts[0].split(':')[0]
        text = parts[1].split(self.sep_for_idx)[0]
        uri = self.doc_uri + URI_SEP + str(element_id)
        text_elements.append(
            TextElement(uri=uri, text=text, span=[(0, len(text))],
                        metadata={}, category_to_label={}))
        category = fine_grained if self.use_fine_grained_labels else coarse_grained
        # positive label for this element's category, negative for the rest
        category_to_label_dict = {
            cat: Label(labels=self.LABEL_POSITIVE, metadata={})
            if cat == category
            else Label(labels=self.LABEL_NEGATIVE, metadata={})
            for cat in all_categories
        }
        uri_to_category_labels.append((uri, category_to_label_dict))
    self.documents = [
        Document(uri=self.doc_uri, text_elements=text_elements, metadata={})
    ]
    self.uri_category_labels = uri_to_category_labels